diff --git a/python/pyabacus/CONTRIBUTING.md b/python/pyabacus/CONTRIBUTING.md index b5d7728eae..dedd981e44 100644 --- a/python/pyabacus/CONTRIBUTING.md +++ b/python/pyabacus/CONTRIBUTING.md @@ -190,8 +190,10 @@ list(APPEND _diago ${HSOLVER_PATH}/diag_const_nums.cpp ${HSOLVER_PATH}/diago_iter_assist.cpp ${HSOLVER_PATH}/kernels/dngvd_op.cpp + ${HSOLVER_PATH}/kernels/bpcg_kernel_op.cpp ${BASE_PATH}/kernels/math_kernel_op.cpp - ${BASE_PATH}/kernels/math_op.cpp + ${BASE_PATH}/kernels/math_kernel_op_vec.cpp + ${BASE_PATH}/kernels/math_ylm_op.cpp ${BASE_PATH}/module_device/device.cpp ${BASE_PATH}/module_device/memory_op.cpp ${PSI_PATH}/psi.cpp diff --git a/python/pyabacus/src/ModuleBase/CMakeLists.txt b/python/pyabacus/src/ModuleBase/CMakeLists.txt index 1c2d9a728b..e89a065147 100644 --- a/python/pyabacus/src/ModuleBase/CMakeLists.txt +++ b/python/pyabacus/src/ModuleBase/CMakeLists.txt @@ -1,7 +1,8 @@ list(APPEND pymodule_base ${PROJECT_SOURCE_DIR}/src/ModuleBase/py_base_math.cpp - ${BASE_PATH}/kernels/math_op.cpp + ${BASE_PATH}/kernels/math_ylm_op.cpp ${BASE_PATH}/kernels/math_kernel_op.cpp + ${BASE_PATH}/kernels/math_kernel_op_vec.cpp ${BASE_PATH}/module_device/memory_op.cpp ${BASE_PATH}/module_device/device.cpp ) diff --git a/python/pyabacus/src/ModuleNAO/CMakeLists.txt b/python/pyabacus/src/ModuleNAO/CMakeLists.txt index 5e86604adc..b9b7fe9935 100644 --- a/python/pyabacus/src/ModuleNAO/CMakeLists.txt +++ b/python/pyabacus/src/ModuleNAO/CMakeLists.txt @@ -13,8 +13,9 @@ list(APPEND _naos ${NAO_PATH}/two_center_integrator.cpp ${NAO_PATH}/two_center_table.cpp # dependency - ${ABACUS_SOURCE_DIR}/module_base/kernels/math_op.cpp + ${ABACUS_SOURCE_DIR}/module_base/kernels/math_ylm_op.cpp ${ABACUS_SOURCE_DIR}/module_base/kernels/math_kernel_op.cpp + ${ABACUS_SOURCE_DIR}/module_base/kernels/math_kernel_op_vec.cpp # ${ABACUS_SOURCE_DIR}/module_psi/kernels/psi_memory_op.cpp ${ABACUS_SOURCE_DIR}/module_base/module_device/memory_op.cpp ${ABACUS_SOURCE_DIR}/module_base/module_device/device.cpp diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt index 4bd0153b48..7c9f216870 100644 --- a/python/pyabacus/src/hsolver/CMakeLists.txt +++ b/python/pyabacus/src/hsolver/CMakeLists.txt @@ -10,9 +10,11 @@ list(APPEND _diago ${HSOLVER_PATH}/kernels/dngvd_op.cpp + ${HSOLVER_PATH}/kernels/bpcg_kernel_op.cpp # dependency ${BASE_PATH}/kernels/math_kernel_op.cpp - ${BASE_PATH}/kernels/math_op.cpp + ${BASE_PATH}/kernels/math_kernel_op_vec.cpp + ${BASE_PATH}/kernels/math_ylm_op.cpp ${BASE_PATH}/module_device/device.cpp ${BASE_PATH}/module_device/memory_op.cpp diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 5acce9103f..43e8fcdc16 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -36,6 +36,7 @@ list(APPEND device_srcs module_hamilt_pw/hamilt_stodft/kernels/hpsi_norm_op.cpp module_basis/module_pw/kernels/pw_op.cpp module_hsolver/kernels/dngvd_op.cpp + module_hsolver/kernels/bpcg_kernel_op.cpp module_elecstate/kernels/elecstate_op.cpp # module_psi/kernels/psi_memory_op.cpp @@ -44,13 +45,14 @@ list(APPEND device_srcs module_base/module_device/device.cpp module_base/module_device/memory_op.cpp module_base/kernels/math_kernel_op.cpp + module_base/kernels/math_kernel_op_vec.cpp module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/wf_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/vnl_op.cpp - 
module_base/kernels/math_op.cpp + module_base/kernels/math_ylm_op.cpp module_hamilt_general/module_xc/kernels/xc_functional_op.cpp ) @@ -64,6 +66,7 @@ if(USE_CUDA) module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu module_basis/module_pw/kernels/cuda/pw_op.cu module_hsolver/kernels/cuda/dngvd_op.cu + module_hsolver/kernels/cuda/bpcg_kernel_op.cu module_elecstate/kernels/cuda/elecstate_op.cu # module_psi/kernels/cuda/memory_op.cu @@ -73,8 +76,9 @@ if(USE_CUDA) module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu module_hamilt_pw/hamilt_pwdft/kernels/cuda/wf_op.cu module_hamilt_pw/hamilt_pwdft/kernels/cuda/vnl_op.cu - module_base/kernels/cuda/math_op.cu + module_base/kernels/cuda/math_ylm_op.cu module_base/kernels/cuda/math_kernel_op.cu + module_base/kernels/cuda/math_kernel_op_vec.cu module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu ) endif() @@ -89,6 +93,7 @@ if(USE_ROCM) module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu module_basis/module_pw/kernels/rocm/pw_op.hip.cu module_hsolver/kernels/rocm/dngvd_op.hip.cu + module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu module_elecstate/kernels/rocm/elecstate_op.hip.cu # module_psi/kernels/rocm/memory_op.hip.cu @@ -99,7 +104,8 @@ if(USE_ROCM) module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu module_base/kernels/rocm/math_kernel_op.hip.cu - module_base/kernels/rocm/math_op.hip.cu + module_base/kernels/rocm/math_kernel_op_vec.hip.cu + module_base/kernels/rocm/math_ylm_op.hip.cu module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu ) endif() diff --git a/source/Makefile.Objects b/source/Makefile.Objects index f15bde444f..ffb49d2c6d 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -146,8 +146,9 @@ OBJS_BASE=abfs-vector3_order.o\ math_ylmreal.o\ math_bspline.o\ math_chebyshev.o\ - math_op.o\ + math_ylm_op.o\ math_kernel_op.o\ + math_kernel_op_vec.o\ mathzone_add1.o\ matrix.o\ matrix3.o\ @@ -349,6 +350,7 @@ OBJS_HSOLVER=diago_cg.o\ hsolver_pw_sdft.o\ diago_iter_assist.o\ dngvd_op.o\ + bpcg_kernel_op.o\ diag_const_nums.o\ diag_hs_para.o\ diago_pxxxgvx.o\ diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu index 7227df630d..3a42767925 100644 --- a/source/module_base/kernels/cuda/math_kernel_op.cu +++ b/source/module_base/kernels/cuda/math_kernel_op.cu @@ -8,22 +8,6 @@ #include #include #include - -namespace ModuleBase -{ -const int warp_size = 32; -// const unsigned int full_mask = 0xffffffff; -const int thread_per_block = 256; -} - -template <> -struct GetTypeReal> { - using type = float; /**< The return type specialization for std::complex. */ -}; -template <> -struct GetTypeReal> { - using type = double; /**< The return type specialization for std::complex. 
*/ -}; namespace ModuleBase { template struct GetTypeThrust { @@ -42,12 +26,10 @@ struct GetTypeThrust> { static cublasHandle_t cublas_handle = nullptr; -static inline void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) { cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result)); } -static inline void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) { cublasErrcheck(cublasDdot(cublas_handle, n, x, incx, y, incy, &result)); } @@ -70,310 +52,57 @@ void destoryBLAShandle(){ // for (int offset = 16; offset > 0; offset >>= 1) // val += __shfl_down_sync(full_mask, val, offset); // } - -template -__global__ void line_minimize_with_block( - thrust::complex* grad, - thrust::complex* hgrad, - thrust::complex* psi, - thrust::complex* hpsi, - const int n_basis, - const int n_basis_max) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - __shared__ Real data[thread_per_block * 3]; - - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (grad[item] * thrust::conj(grad[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - // For threads in the same warp, it is better that they process the same work - // Also, __syncwarp() should be used instead of __syncthreads() - // Therefore we unroll the loop and ensure that the threads does the same work - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - } - - __syncthreads(); - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - data[thread_per_block + tid] = 0; - data[2 * thread_per_block + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - grad[item] *= norm; - hgrad[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - data[thread_per_block + tid] += (grad[item] * thrust::conj(hpsi[item])).real(); - data[2 * thread_per_block + tid] += (grad[item] * thrust::conj(hgrad[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[thread_per_block + tid] += data[thread_per_block + tid + ii]; - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + ii]; - } - __syncthreads(); - } - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 
32]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 32]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 16]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 8]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 4]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 2]; __syncwarp(); - data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 1]; __syncwarp(); - } - __syncthreads(); - epsilo_0 = data[0]; - epsilo_1 = data[thread_per_block]; - epsilo_2 = data[2 * thread_per_block]; - - theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); - cos_theta = cos(theta); - sin_theta = sin(theta); - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - psi [item] = psi [item] * cos_theta + grad [item] * sin_theta; - hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta; - } + cublasErrcheck(cublasCscal(cublas_handle, N, (float2*)alpha, (float2*)X, incx)); } -template -__global__ void calc_grad_with_block( - const Real* prec, - Real* err, - Real* beta, - thrust::complex* psi, - thrust::complex* hpsi, - thrust::complex* grad, - thrust::complex* grad_old, - const int n_basis, - const int n_basis_max) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real err_st = 0.0; - Real beta_st = 0.0; - Real epsilo = 0.0; - Real grad_2 = 0.0; - thrust::complex grad_1 = {0, 0}; - __shared__ Real data[thread_per_block * 2]; - - // Init shared memory - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (psi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - } - - __syncthreads(); - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - psi[item] *= norm; - hpsi[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - if (tid < warp_size) { - data[tid] += data[tid + 32]; 
__syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - } - - __syncthreads(); - epsilo = data[0]; - __syncthreads(); - - data[tid] = 0; - data[thread_per_block + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad_2 = thrust::norm(grad_1); - data[tid] += grad_2; - data[thread_per_block + tid] += grad_2 / prec[basis_idx]; - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[thread_per_block + tid] += data[thread_per_block + tid + ii]; - } - __syncthreads(); - } - - if (tid < warp_size) { - data[tid] += data[tid + 32]; __syncwarp(); - data[tid] += data[tid + 16]; __syncwarp(); - data[tid] += data[tid + 8]; __syncwarp(); - data[tid] += data[tid + 4]; __syncwarp(); - data[tid] += data[tid + 2]; __syncwarp(); - data[tid] += data[tid + 1]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); - data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); - } - - __syncthreads(); - err_st = data[0]; - beta_st = data[thread_per_block]; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item]; - } - - __syncthreads(); - if (tid == 0) { - beta[band_idx] = beta_st; - err[band_idx] = sqrt(err_st); - } + cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx)); } -// Define the CUDA kernel: -template -__global__ void vector_div_constant_kernel( - const int size, - T* result, - const T* vector, - const typename GetTypeReal::type constant) +template <> +void axpy_op::operator()(const int& N, + const double* alpha, + const double* X, + const int& incX, + double* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector[i] / constant; - } + cublasErrcheck(cublasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); } -template -__global__ void vector_mul_vector_kernel(const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2, - const bool add) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - if (add) - { - result[i] += vector1[i] * vector2[i]; - } - else - { - result[i] = vector1[i] * vector2[i]; - } - } + cublasErrcheck(cublasCaxpy(cublas_handle, N, (float2*)alpha, (float2*)X, incX, (float2*)Y, incY)); } -template -__global__ void vector_div_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* 
vector2) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] / vector2[i]; - } + cublasErrcheck(cublasZaxpy(cublas_handle, N, (double2*)alpha, (double2*)X, incX, (double2*)Y, incY)); } -template -__global__ void constantvector_addORsub_constantVector_kernel( - const int size, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } -} template __global__ void matrix_transpose_kernel( @@ -404,303 +133,6 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const } } -template -void line_minimize_with_block_op::operator()(T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(grad_out); - auto B = reinterpret_cast*>(hgrad_out); - auto C = reinterpret_cast*>(psi_out); - auto D = reinterpret_cast*>(hpsi_out); - - line_minimize_with_block<<>>( - A, B, C, D, - n_basis, n_basis_max); - - cudaCheckOnDebug(); -} - -template -void calc_grad_with_block_op::operator()(const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(psi_out); - auto B = reinterpret_cast*>(hpsi_out); - auto C = reinterpret_cast*>(grad_out); - auto D = reinterpret_cast*>(grad_old_out); - - calc_grad_with_block<<>>( - prec_in, err_out, beta_out, - A, B, C, D, - n_basis, n_basis_max); - - cudaCheckOnDebug(); -} - -template <> -double dot_real_op::operator()(const int& dim, - const double* psi_L, - const double* psi_R, - const bool reduce) -{ - double result = 0.0; - xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} -// for this implementation, please check -// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify -// 2022-10-03 Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) GPU specialization of actual computation. 
-template -inline FPTYPE dot_complex_wrapper(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // denghui modify 2022-10-07 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = 0.0; - xdot_wrapper(dim * 2, pL, 1, pR, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} - -template <> -float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} -template <> -double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} - -// vector operator: result[i] = vector[i] / constant -template <> -void vector_div_constant_op::operator()(const int dim, - double* result, - const double* vector, - const double constant) -{ - // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40 - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_constant_kernel <<>> (dim, result, vector, constant); - - cudaCheckOnDebug(); -} - -// vector operator: result[i] = vector[i] / constant -template -inline void vector_div_constant_complex_wrapper(const int dim, - std::complex* result, - const std::complex* vector, - const FPTYPE constant) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector_tmp = reinterpret_cast*>(vector); - - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_constant_kernel> <<>> (dim, result_tmp, vector_tmp, constant); - - cudaCheckOnDebug(); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int dim, - std::complex* result, - const std::complex* vector, - const float constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()( - const int dim, - std::complex* result, - const std::complex* vector, - const double constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); -} -// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) -template <> -void vector_mul_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2, - const bool& add) -{ - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel <<>> (dim, result, vector1, vector2, add); - - cudaCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) -template -inline void vector_mul_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2, - const bool& add) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2, add); - - cudaCheckOnDebug(); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const 
int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} - -// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex) -template <> -void vector_div_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2) -{ - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel <<>> (dim, result, vector1, vector2); - - cudaCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template -inline void vector_div_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); - - cudaCheckOnDebug(); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2) -{ - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2) -{ - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); -} -// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template -void constantvector_addORsub_constantVector_op::operator()(const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) -{ - using Type = typename GetTypeThrust::type; - using Real = typename GetTypeReal::type; - - auto result_tmp = reinterpret_cast(result); - auto vector1_tmp = reinterpret_cast(vector1); - auto vector2_tmp = reinterpret_cast(vector2); - - int thread = thread_per_block; - int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); - - cudaCheckOnDebug(); -} - -template <> -void axpy_op::operator()(const int& N, - const double* alpha, - const double* X, - const int& incX, - double* Y, - const int& incY) -{ - cublasErrcheck(cublasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const int& incY) -{ - cublasErrcheck(cublasCaxpy(cublas_handle, N, (float2*)alpha, (float2*)X, incX, (float2*)Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const int& incY) -{ - cublasErrcheck(cublasZaxpy(cublas_handle, N, (double2*)alpha, (double2*)X, incX, (double2*)Y, incY)); -} - cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name) { if (trans == 
'N') @@ -778,24 +210,6 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const ch cublasErrcheck(cublasZgemv(cublas_handle, cutrans, m, n, &alpha, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta, (cuDoubleComplex*)Y, incx)); } -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - cublasErrcheck(cublasCscal(cublas_handle, N, (float2*)alpha, (float2*)X, incx)); -} - -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx)); -} - template <> void gemm_op::operator()(const char& transa, const char& transb, @@ -1026,32 +440,8 @@ void matrixCopy, base_device::DEVICE_GPU>::operator()(const // Explicitly instantiate functors for the types of functor registered. -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; -template struct matrixCopy, base_device::DEVICE_GPU>; -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct matrixCopy, base_device::DEVICE_GPU>; template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_GPU>; - -#ifdef __LCAO -template struct dot_real_op; -template struct vector_div_constant_op; -template struct vector_div_vector_op; -#endif } // namespace ModuleBase diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu new file mode 100644 index 0000000000..ecc5e8b898 --- /dev/null +++ b/source/module_base/kernels/cuda/math_kernel_op_vec.cu @@ -0,0 +1,325 @@ +#include "module_base/kernels/math_kernel_op.h" + +#include +#include + +template <> +struct GetTypeReal> { + using type = float; /**< The return type specialization for std::complex. */ +}; +template <> +struct GetTypeReal> { + using type = double; /**< The return type specialization for std::complex. 
*/ +}; +namespace ModuleBase +{ +const int thread_per_block = 256; +void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result); +void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result); + +// Define the CUDA kernel: +template +__global__ void vector_mul_real_kernel(const int size, + T* result, + const T* vector, + const typename GetTypeReal::type constant) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector[i] * constant; + } +} + +template +__global__ void vector_mul_vector_kernel(const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2, + const bool add) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + if (add) + { + result[i] += vector1[i] * vector2[i]; + } + else + { + result[i] = vector1[i] * vector2[i]; + } + } +} + +template +__global__ void vector_div_vector_kernel(const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector1[i] / vector2[i]; + } +} + +template +__global__ void constantvector_addORsub_constantVector_kernel(const int size, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } +} + +// vector operator: result[i] = vector[i] * constant +template <> +void vector_mul_real_op::operator()(const int dim, + double* result, + const double* vector, + const double constant) +{ + // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40 + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_real_kernel<<>>(dim, result, vector, constant); + + cudaCheckOnDebug(); +} + +template +inline void vector_mul_real_wrapper(const int dim, + std::complex* result, + const std::complex* vector, + const FPTYPE constant) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector_tmp = reinterpret_cast*>(vector); + + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_real_kernel><<>>(dim, result_tmp, vector_tmp, constant); + + cudaCheckOnDebug(); +} +template <> +void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim, + std::complex* result, + const std::complex* vector, + const float constant) +{ + vector_mul_real_wrapper(dim, result, vector, constant); +} +template <> +void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim, + std::complex* result, + const std::complex* vector, + const double constant) +{ + vector_mul_real_wrapper(dim, result, vector, constant); +} + +// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) +template <> +void vector_mul_vector_op::operator()(const int& dim, + double* result, + const double* vector1, + const double* vector2, + const bool& add) +{ + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel<<>>(dim, result, vector1, vector2, add); + + cudaCheckOnDebug(); +} +// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) +template +inline void vector_mul_vector_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* 
vector2, + const bool& add) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel><<>>(dim, result_tmp, vector1_tmp, vector2, add); + + cudaCheckOnDebug(); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} + +// vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex) +template <> +void vector_div_vector_op::operator()(const int& dim, + double* result, + const double* vector1, + const double* vector2) +{ + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_vector_kernel<<>>(dim, result, vector1, vector2); + + cudaCheckOnDebug(); +} +// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) +template +inline void vector_div_vector_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_vector_kernel><<>>(dim, result_tmp, vector1_tmp, vector2); + + cudaCheckOnDebug(); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2) +{ + vector_div_vector_complex_wrapper(dim, result, vector1, vector2); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2) +{ + vector_div_vector_complex_wrapper(dim, result, vector1, vector2); +} + +// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 +template +void constantvector_addORsub_constantVector_op::operator()(const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) +{ + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); + + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + constantvector_addORsub_constantVector_kernel + <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); + + cudaCheckOnDebug(); +} + +template <> +double dot_real_op::operator()(const int& dim, + const double* psi_L, + const double* psi_R, + const bool reduce) +{ + double result = 0.0; + xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} +// for this implementation, please check +// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify +// 2022-10-03 Note that ddot_(2*dim,a,1,b,1) = REAL( 
zdotc_(dim,a,1,b,1) ) GPU specialization of actual computation. +template +inline FPTYPE dot_complex_wrapper(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // denghui modify 2022-10-07 + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = 0.0; + xdot_wrapper(dim * 2, pL, 1, pR, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} + +template <> +float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} +template <> +double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} + +// Explicitly instantiate functors for the types of functor registered. +template struct vector_mul_real_op, base_device::DEVICE_GPU>; +template struct vector_mul_real_op; +template struct vector_mul_real_op, base_device::DEVICE_GPU>; + +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; + +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; + +template struct dot_real_op, base_device::DEVICE_GPU>; +template struct dot_real_op; +template struct dot_real_op, base_device::DEVICE_GPU>; +} // namespace ModuleBase \ No newline at end of file diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_ylm_op.cu similarity index 99% rename from source/module_base/kernels/cuda/math_op.cu rename to source/module_base/kernels/cuda/math_ylm_op.cu index cef3c04f3f..390f22b93b 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_ylm_op.cu @@ -1,5 +1,5 @@ #include -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp index 9420d7c4b7..9c4ccdc42b 100644 --- a/source/module_base/kernels/math_kernel_op.cpp +++ b/source/module_base/kernels/math_kernel_op.cpp @@ -6,241 +6,6 @@ namespace ModuleBase { -template -struct line_minimize_with_block_op -{ - using Real = typename GetTypeReal::type; - void operator()(T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) - { - for (int band_idx = 0; band_idx < n_band; band_idx++) - { - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); - Parallel_Reduce::reduce_pool(norm); - 
norm = 1.0 / sqrt(norm); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - grad_out[item] *= norm; - hgrad_out[item] *= norm; - epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item])); - epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); - epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); - } - Parallel_Reduce::reduce_pool(epsilo_0); - Parallel_Reduce::reduce_pool(epsilo_1); - Parallel_Reduce::reduce_pool(epsilo_2); - theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); - cos_theta = std::cos(theta); - sin_theta = std::sin(theta); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - psi_out[item] = psi_out[item] * cos_theta + grad_out[item] * sin_theta; - hpsi_out[item] = hpsi_out[item] * cos_theta + hgrad_out[item] * sin_theta; - } - } - } -}; - -template -struct calc_grad_with_block_op -{ - using Real = typename GetTypeReal::type; - void operator()(const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) - { - for (int band_idx = 0; band_idx < n_band; band_idx++) - { - Real err = 0.0; - Real beta = 0.0; - Real epsilo = 0.0; - Real grad_2 = {0.0}; - T grad_1 = {0.0, 0.0}; - auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); - Parallel_Reduce::reduce_pool(norm); - norm = 1.0 / sqrt(norm); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - psi_out[item] *= norm; - hpsi_out[item] *= norm; - epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); - } - Parallel_Reduce::reduce_pool(epsilo); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_2 = std::norm(grad_1); - err += grad_2; - beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? - } - Parallel_Reduce::reduce_pool(err); - Parallel_Reduce::reduce_pool(beta); - for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) - { - auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item]; - } - beta_out[band_idx] = beta; - err_out[band_idx] = sqrt(err); - } - } -}; - -template -struct dot_real_op -{ - FPTYPE operator()(const int& dim, - const FPTYPE* psi_L, - const FPTYPE* psi_R, - const bool reduce) - { - FPTYPE result = BlasConnector::dot(dim, psi_L, 1, psi_R, 1); - if (reduce) - { - Parallel_Reduce::reduce_pool(result); - } - return result; - } -}; - -// CPU specialization of actual computation. 
-template -struct dot_real_op, base_device::DEVICE_CPU> -{ - FPTYPE operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) - { - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // qianrui modify 2021-3-14 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1); - if (reduce) - { - Parallel_Reduce::reduce_pool(result); - } - return result; - } -}; - -template -struct vector_div_constant_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int dim, T* result, const T* vector, const Real constant) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector[i] / constant; - } - } -}; - -template -struct vector_mul_vector_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add) - { - if (add) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] += vector1[i] * vector2[i]; - } - } - else - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * vector2[i]; - } - } - } -}; - -template -struct vector_div_vector_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] / vector2[i]; - } - } -}; - -template -struct constantvector_addORsub_constantVector_op -{ - using Real = typename GetTypeReal::type; - void operator()(const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) - { -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } - } -}; - -template -struct scal_op -{ - void operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) - { - BlasConnector::scal(N, *alpha, X, incx); - } -}; - template struct gemv_op { @@ -260,20 +25,6 @@ struct gemv_op } }; -template -struct axpy_op -{ - void operator()(const int& dim, - const T* alpha, - const T* X, - const int& incX, - T* Y, - const int& incY) - { - BlasConnector::axpy(dim, *alpha, X, incX, Y, incY); - } -}; - template struct gemm_op { @@ -367,48 +118,23 @@ struct matrixCopy } }; -// Explicitly instantiate functors for the types of functor registered. 
-template struct scal_op; -template struct axpy_op, base_device::DEVICE_CPU>; template struct gemv_op, base_device::DEVICE_CPU>; template struct gemv_op; template struct gemm_op, base_device::DEVICE_CPU>; template struct gemm_op; -template struct dot_real_op, base_device::DEVICE_CPU>; -template struct vector_div_constant_op, base_device::DEVICE_CPU>; -template struct vector_mul_vector_op, base_device::DEVICE_CPU>; -template struct vector_div_vector_op, base_device::DEVICE_CPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; template struct matrixCopy, base_device::DEVICE_CPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; -template struct scal_op; -template struct axpy_op, base_device::DEVICE_CPU>; -template struct axpy_op; template struct gemv_op, base_device::DEVICE_CPU>; template struct gemv_op; template struct gemm_op, base_device::DEVICE_CPU>; template struct gemm_op; -template struct dot_real_op, base_device::DEVICE_CPU>; -template struct dot_real_op; -template struct vector_div_constant_op, base_device::DEVICE_CPU>; -template struct vector_mul_vector_op, base_device::DEVICE_CPU>; -template struct vector_div_vector_op, base_device::DEVICE_CPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; template struct matrixTranspose_op, base_device::DEVICE_CPU>; template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_CPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; #ifdef __LCAO -template struct vector_mul_vector_op; -template struct vector_div_constant_op; -template struct vector_div_vector_op; template struct matrixTranspose_op; -template struct constantvector_addORsub_constantVector_op; #endif #ifdef __DSP template struct gemm_op_mt, base_device::DEVICE_CPU>; diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h index daca2ac548..fdb6d24457 100644 --- a/source/module_base/kernels/math_kernel_op.h +++ b/source/module_base/kernels/math_kernel_op.h @@ -19,6 +19,9 @@ namespace ModuleBase { +//--------------------------------------------------------------------------------- +//-----------------------------0. Tool Functions----------------------------------- +//--------------------------------------------------------------------------------- inline std::complex set_real_tocomplex(const std::complex &x) { return {x.real(), 0.0}; } @@ -43,68 +46,30 @@ inline double get_conj(const double &x) { return x; } inline float get_conj(const float &x) { return x; } -template struct line_minimize_with_block_op { - /// @brief dot_real_op computes the dot product of the given complex - /// arrays(treated as float arrays). And there's may have MPI communications - /// while enabling planewave parallization strategy. 
- /// - /// Input Parameters - /// \param dev : the type of computing device - /// \param A : input array arr - /// \param dim : size of A - /// \param lda : leading dimention of A - /// \param batch : batch size, the size of the result array res - /// - /// \return res : the result vector - /// T : dot product result - void operator()(T *grad_out, T *hgrad_out, T *psi_out, T *hpsi_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; - -template struct calc_grad_with_block_op { - /// @brief dot_real_op computes the dot product of the given complex - /// arrays(treated as float arrays). And there's may have MPI communications - /// while enabling planewave parallization strategy. - /// - /// Input Parameters - /// \param dev : the type of computing device - /// \param A : input array arr - /// \param dim : size of A - /// \param lda : leading dimention of A - /// \param batch : batch size, the size of the result array res - /// - /// \return res : the result vector - /// T : dot product result - using Real = typename GetTypeReal::type; - void operator()(const Real *prec_in, Real *err_out, Real *beta_out, - T *psi_out, T *hpsi_out, T *grad_out, T *grad_old_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; -template struct dot_real_op { - using Real = typename GetTypeReal::type; - /// @brief dot_real_op computes the dot product of the given complex - /// arrays(treated as float arrays). And there's may have MPI communications - /// while enabling planewave parallization strategy. +//--------------------------------------------------------------------------------- +//-----------------------------1. Vector Operations-------------------------------- +//--------------------------------------------------------------------------------- +template struct scal_op { + /// @brief x = alpha * x, where alpha and x are complex numbers /// /// Input Parameters - /// \param dim : array size - /// \param psi_L : input array A - /// \param psi_R : input array B - /// \param reduce : flag to control whether to perform the MPI communications + /// \param N : array size + /// \param alpha : input constant + /// \param X : input array + /// \param incx : computing strip of array X /// - /// \return - /// FPTYPE : dot product result - Real operator()(const int &dim, const T *psi_L, - const T *psi_R, const bool reduce = true); + /// Output Parameters + /// \param X : output array + void operator()(const int &N, + const std::complex *alpha, std::complex *X, + const int &incx); }; -// vector operator: result[i] = vector[i] / constant -template struct vector_div_constant_op { +template struct vector_mul_real_op { using Real = typename GetTypeReal::type; - /// @brief result[i] = vector[i] / constant + /// @brief result[i] = vector[i] * constant, where vector is a complex array and constant is a real number. + /// It is different from scal_op, which scales a complex array by a complex constant.
/// /// Input Parameters /// \param dim : array size @@ -113,25 +78,8 @@ template struct vector_div_constant_op { /// /// Output Parameters /// \param result : output array - void operator()(const int dim, T *result, const T *vector, - const Real constant); -}; - -// replace vector_div_constant_op : x = alpha * x -template struct scal_op { - /// @brief x = alpha * x - /// - /// Input Parameters - /// \param N : array size - /// \param alpha : input constant - /// \param X : input array - /// \param incx : computing strip of array X - /// - /// Output Parameters - /// \param X : output array - void operator()(const int &N, - const std::complex *alpha, std::complex *X, - const int &incx); + /// \note Multiplication is used instead of division, since it is faster. + void operator()(const int dim, T* result, const T* vector, const Real constant); }; // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) @@ -166,6 +114,24 @@ template struct vector_div_vector_op { const Real *vector2); }; +// compute Y = alpha * X + Y +template struct axpy_op { + /// @brief Y = alpha * X + Y + /// + /// Input Parameters + /// \param N : array size + /// \param alpha : input constant alpha + /// \param X : input array X + /// \param incX : computing strip of X + /// \param Y : input array Y + /// \param incY : computing strip of Y + /// + /// Output Parameters + /// \param Y : output array Y + void operator()(const int &N, const T *alpha, const T *X, + const int &incX, T *Y, const int &incY); +}; + // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 template struct constantvector_addORsub_constantVector_op { @@ -185,24 +151,29 @@ struct constantvector_addORsub_constantVector_op { const Real constant1, const T *vector2, const Real constant2); }; -// compute Y = alpha * X + Y -template struct axpy_op { - /// @brief Y = alpha * X + Y +template struct dot_real_op { + using Real = typename GetTypeReal::type; + /// @brief dot_real_op computes the dot product of the given complex + /// arrays (treated as float arrays). MPI communication may occur + /// when the plane-wave parallelization strategy is enabled. /// /// Input Parameters - /// \param N : array size - /// \param alpha : input constant alpha - /// \param X : input array X - /// \param incX : computing strip of X - /// \param Y : computing strip of Y - /// \param incY : computing strip of Y + /// \param dim : array size + /// \param psi_L : input array A + /// \param psi_R : input array B + /// \param reduce : flag to control whether to perform the MPI communications /// - /// Output Parameters - /// \param Y : output array Y - void operator()(const int &N, const T *alpha, const T *X, - const int &incX, T *Y, const int &incY); + /// \return + /// FPTYPE : dot product result + Real operator()(const int &dim, const T *psi_L, + const T *psi_R, const bool reduce = true); }; + +//--------------------------------------------------------------------------------- +//-----------------------------2. 
Matrix Operations-------------------------------- +//--------------------------------------------------------------------------------- + // compute y = alpha * op(A) * x + beta * y template struct gemv_op { /// @brief y = alpha * op(A) * x + beta * y @@ -314,24 +285,6 @@ template struct matrixCopy { }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - -template -struct line_minimize_with_block_op { - using Real = typename GetTypeReal::type; - void operator()(T *grad_out, T *hgrad_out, T *psi_out, T *hpsi_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; - -template -struct calc_grad_with_block_op { - using Real = typename GetTypeReal::type; - void operator()(const Real *prec_in, Real *err_out, Real *beta_out, - T *psi_out, T *hpsi_out, T *grad_out, T *grad_old_out, - const int &n_basis, const int &n_basis_max, - const int &n_band); -}; - // Partially specialize functor for base_device::GpuDevice. template struct dot_real_op { using Real = typename GetTypeReal::type; @@ -341,10 +294,10 @@ template struct dot_real_op { // vector operator: result[i] = vector[i] / constant template -struct vector_div_constant_op { +struct vector_mul_real_op +{ using Real = typename GetTypeReal::type; - void operator()(const int dim, T *result, - const T *vector, const Real constant); + void operator()(const int dim, T* result, const T* vector, const Real constant); }; // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp new file mode 100644 index 0000000000..747857c580 --- /dev/null +++ b/source/module_base/kernels/math_kernel_op_vec.cpp @@ -0,0 +1,177 @@ +#include "module_base/kernels/math_kernel_op.h" + +namespace ModuleBase +{ + +template +struct scal_op +{ + void operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) + { + BlasConnector::scal(N, *alpha, X, incx); + } +}; + +template +struct vector_mul_real_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int dim, T* result, const T* vector, const Real constant) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector[i] * constant; + } + } +}; + +template +struct vector_mul_vector_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add) + { + if (add) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] += vector1[i] * vector2[i]; + } + } + else + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * vector2[i]; + } + } + } +}; + +template +struct vector_div_vector_op +{ + using Real = typename GetTypeReal::type; + void operator()(const int& dim, T* result, const T* vector1, const Real* vector2) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] / vector2[i]; + } + } +}; + +template +struct axpy_op +{ + void operator()(const int& dim, + const T* alpha, + const T* X, + const int& incX, + T* Y, + const int& incY) + { + BlasConnector::axpy(dim, *alpha, X, incX, Y, incY); + } +}; + + +template +struct constantvector_addORsub_constantVector_op +{ + using Real = 
typename GetTypeReal::type; + void operator()(const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) + { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } +}; + + + +template +struct dot_real_op +{ + FPTYPE operator()(const int& dim, const FPTYPE* psi_L, const FPTYPE* psi_R, const bool reduce) + { + FPTYPE result = BlasConnector::dot(dim, psi_L, 1, psi_R, 1); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; + } +}; + +template +struct dot_real_op, base_device::DEVICE_CPU> +{ + FPTYPE operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) + { + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = BlasConnector::dot(2 * dim, pL, 1, pR, 1); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; + } +}; + +template struct scal_op; +template struct scal_op; + +template struct vector_mul_real_op, base_device::DEVICE_CPU>; +template struct vector_mul_real_op; +template struct vector_mul_real_op, base_device::DEVICE_CPU>; + +template struct vector_mul_vector_op, base_device::DEVICE_CPU>; +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_CPU>; + +template struct vector_div_vector_op, base_device::DEVICE_CPU>; +template struct vector_div_vector_op; +template struct vector_div_vector_op, base_device::DEVICE_CPU>; + +template struct axpy_op, base_device::DEVICE_CPU>; +template struct axpy_op, base_device::DEVICE_CPU>; +template struct axpy_op; + +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_CPU>; + +template struct dot_real_op, base_device::DEVICE_CPU>; +template struct dot_real_op, base_device::DEVICE_CPU>; +template struct dot_real_op; +} // namespace ModuleBase diff --git a/source/module_base/kernels/math_op.cpp b/source/module_base/kernels/math_ylm_op.cpp similarity index 99% rename from source/module_base/kernels/math_op.cpp rename to source/module_base/kernels/math_ylm_op.cpp index 1abf464df1..0846801797 100644 --- a/source/module_base/kernels/math_op.cpp +++ b/source/module_base/kernels/math_ylm_op.cpp @@ -1,4 +1,4 @@ -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include "module_base/libm/libm.h" namespace ModuleBase { diff --git a/source/module_base/kernels/math_op.h b/source/module_base/kernels/math_ylm_op.h similarity index 100% rename from source/module_base/kernels/math_op.h rename to source/module_base/kernels/math_ylm_op.h diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu index a6a787faf8..1211b76b6d 100644 --- a/source/module_base/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_base/kernels/rocm/math_kernel_op.hip.cu @@ -7,10 +7,6 @@ #include #include #include - -#define WARP_SIZE 32 -#define FULL_MASK 0xffffffff -#define THREAD_PER_BLOCK 256 template <> struct GetTypeReal> { using type = float; /**< The return type specialization for std::complex. 
*/ @@ -39,12 +35,10 @@ struct GetTypeThrust> { static hipblasHandle_t cublas_handle = nullptr; -static inline void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) { hipblasErrcheck(hipblasSdot(cublas_handle, n, x, incx, y, incy, &result)); } -static inline void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) { hipblasErrcheck(hipblasDdot(cublas_handle, n, x, incx, y, incy, &result)); } @@ -62,239 +56,62 @@ void destoryBLAShandle(){ } } -template -__global__ void line_minimize_with_block( - thrust::complex* grad, - thrust::complex* hgrad, - thrust::complex* psi, - thrust::complex* hpsi, - const int n_basis, - const int n_basis_max) -{ - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; - __shared__ Real data[THREAD_PER_BLOCK * 3]; - - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (grad[item] * thrust::conj(grad[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - data[THREAD_PER_BLOCK + tid] = 0; - data[2 * THREAD_PER_BLOCK + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - grad[item] *= norm; - hgrad[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - data[THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hpsi[item])).real(); - data[2 * THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hgrad[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii]; - data[2 * THREAD_PER_BLOCK + tid] += data[2 * THREAD_PER_BLOCK + tid + ii]; - } - __syncthreads(); - } - epsilo_0 = data[0]; - epsilo_1 = data[THREAD_PER_BLOCK]; - epsilo_2 = data[2 * THREAD_PER_BLOCK]; - - theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); - cos_theta = cos(theta); - sin_theta = sin(theta); - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - psi [item] = psi [item] * cos_theta + grad [item] * sin_theta; - hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta; - } -} - -template -__global__ void calc_grad_with_block( - const Real* prec, - Real* err, - Real* beta, - thrust::complex* psi, - thrust::complex* hpsi, - thrust::complex* grad, - thrust::complex* grad_old, - const int n_basis, - const int n_basis_max) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int band_idx = blockIdx.x; // band_idx - int tid = threadIdx.x; // basis_idx - int item = 0; - Real err_st = 0.0; - Real beta_st = 0.0; - Real epsilo = 0.0; - Real grad_2 = 0.0; - thrust::complex grad_1 = {0, 0}; - __shared__ Real data[THREAD_PER_BLOCK * 2]; - - // Init shared memory - data[tid] = 0; - - for (int basis_idx = tid; basis_idx < 
n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - data[tid] += (psi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - - Real norm = 1.0 / sqrt(data[0]); - __syncthreads(); - - data[tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - psi[item] *= norm; - hpsi[item] *= norm; - data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - epsilo = data[0]; - __syncthreads(); - - data[tid] = 0; - data[THREAD_PER_BLOCK + tid] = 0; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad_2 = thrust::norm(grad_1); - data[tid] += grad_2; - data[THREAD_PER_BLOCK + tid] += grad_2 / prec[basis_idx]; - } - __syncthreads(); - - // just do some parallel reduction in shared memory - for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii]; - } - __syncthreads(); - } - err_st = data[0]; - beta_st = data[THREAD_PER_BLOCK]; - for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) { - item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi[item] - epsilo * psi[item]; - grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item]; - } - - __syncthreads(); - if (tid == 0) { - beta[band_idx] = beta_st; - err[band_idx] = sqrt(err_st); - } + hipblasErrcheck(hipblasCscal(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incx)); } -// Define the CUDA kernel: -template -__launch_bounds__(1024) -__global__ void vector_div_constant_kernel( - const int size, - T* result, - const T* vector, - const typename GetTypeReal::type constant) +template <> +void scal_op::operator()(const int& N, + const std::complex* alpha, + std::complex* X, + const int& incx) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector[i] / constant; - } + hipblasErrcheck(hipblasZscal(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incx)); } -template -__launch_bounds__(1024) -__global__ void vector_mul_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2, - const bool add) +template <> +void axpy_op::operator()(const int& N, + const double* alpha, + const double* X, + const int& incX, + double* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - if (add) - { - result[i] += vector1[i] * vector2[i]; - } - else - { - result[i] = vector1[i] * vector2[i]; - } - } + hipblasErrcheck(hipblasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); } -template -__launch_bounds__(1024) -__global__ void vector_div_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + 
std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] / vector2[i]; - } + hipblasErrcheck( + hipblasCaxpy(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incX, (hipblasComplex*)Y, incY)); } -template -__launch_bounds__(1024) -__global__ void constantvector_addORsub_constantVector_kernel( - const int size, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) +template <> +void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, + const std::complex* alpha, + const std::complex* X, + const int& incX, + std::complex* Y, + const int& incY) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } + hipblasErrcheck(hipblasZaxpy(cublas_handle, + N, + (hipblasDoubleComplex*)alpha, + (hipblasDoubleComplex*)X, + incX, + (hipblasDoubleComplex*)Y, + incY)); } template @@ -328,304 +145,6 @@ __launch_bounds__(1024) __global__ } } -template -void line_minimize_with_block_op::operator()(T* grad_out, - T* hgrad_out, - T* psi_out, - T* hpsi_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(grad_out); - auto B = reinterpret_cast*>(hgrad_out); - auto C = reinterpret_cast*>(psi_out); - auto D = reinterpret_cast*>(hpsi_out); - - line_minimize_with_block<<>>( - A, B, C, D, - n_basis, n_basis_max); - - hipCheckOnDebug(); -} - -template -void calc_grad_with_block_op::operator()(const Real* prec_in, - Real* err_out, - Real* beta_out, - T* psi_out, - T* hpsi_out, - T* grad_out, - T* grad_old_out, - const int& n_basis, - const int& n_basis_max, - const int& n_band) -{ - auto A = reinterpret_cast*>(psi_out); - auto B = reinterpret_cast*>(hpsi_out); - auto C = reinterpret_cast*>(grad_out); - auto D = reinterpret_cast*>(grad_old_out); - - calc_grad_with_block<<>>( - prec_in, err_out, beta_out, - A, B, C, D, - n_basis, n_basis_max); - - hipCheckOnDebug(); -} - -template <> -double dot_real_op::operator()(const int& dim, - const double* psi_L, - const double* psi_R, - const bool reduce) -{ - double result = 0.0; - xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} - -// for this implementation, please check -// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify -// 2022-10-03 Note that ddot_(2*dim,a,1, b,1) = REAL( zdotc_(dim,a,1,b,1) ) GPU specialization of actual computation. 
-template -inline FPTYPE dot_complex_wrapper(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - // denghui modify 2022-10-07 - // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) - const FPTYPE* pL = reinterpret_cast(psi_L); - const FPTYPE* pR = reinterpret_cast(psi_R); - FPTYPE result = 0.0; - xdot_wrapper(dim * 2, pL, 1, pR, 1, result); - if (reduce) { - Parallel_Reduce::reduce_pool(result); - } - return result; -} -template <> -float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} -template <> -double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, - const std::complex* psi_L, - const std::complex* psi_R, - const bool reduce) -{ - return dot_complex_wrapper(dim, psi_L, psi_R, reduce); -} - -template <> -void vector_div_constant_op::operator()(const int dim, - double* result, - const double* vector, - const double constant) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector, constant); - - hipCheckOnDebug(); -} -// vector operator: result[i] = vector[i] / constant -template -inline void vector_div_constant_complex_wrapper(const int dim, - std::complex* result, - const std::complex* vector, - const FPTYPE constant) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector_tmp = reinterpret_cast*>(vector); - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector_tmp, constant); - - hipCheckOnDebug(); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()(const int dim, - std::complex* result, - const std::complex* vector, - const float constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); - - hipCheckOnDebug(); -} -template <> -void vector_div_constant_op, base_device::DEVICE_GPU>::operator()( - const int dim, - std::complex* result, - const std::complex* vector, - const double constant) -{ - vector_div_constant_complex_wrapper(dim, result, vector, constant); - - hipCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex) -template <> -void vector_mul_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2, - const bool& add) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add); - - hipCheckOnDebug(); -} - -// vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) -template -inline void vector_mul_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2, - const bool& add) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add); - - 
hipCheckOnDebug(); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} -template <> -void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2, - const bool& add) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); -} -// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template <> -void vector_div_vector_op::operator()(const int& dim, - double* result, - const double* vector1, - const double* vector2) -{ - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2); - - hipCheckOnDebug(); -} -// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) -template -inline void vector_div_vector_op_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = 1024; - int block = (dim + thread - 1) / thread; - hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2); - - hipCheckOnDebug(); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, - std::complex* result, - const std::complex* vector1, - const float* vector2) -{ - vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); -} -template <> -void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( - const int& dim, - std::complex* result, - const std::complex* vector1, - const double* vector2) -{ - vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); -} - -// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 -template -void constantvector_addORsub_constantVector_op::operator()(const int& dim, - T* result, - const T* vector1, - const Real constant1, - const T* vector2, - const Real constant2) -{ - using Type = typename GetTypeThrust::type; - using Real = typename GetTypeReal::type; - - auto result_tmp = reinterpret_cast(result); - auto vector1_tmp = reinterpret_cast(vector1); - auto vector2_tmp = reinterpret_cast(vector2); - - int thread = 1024; - int block = (dim + thread - 1) / thread; - constantvector_addORsub_constantVector_kernel <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); - - hipCheckOnDebug(); -} - -template <> -void axpy_op::operator()(const int& N, - const double* alpha, - const double* X, - const int& incX, - double* Y, - const int& incY) -{ - hipblasErrcheck(hipblasDaxpy(cublas_handle, N, alpha, X, incX, Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const int& incY) -{ - hipblasErrcheck(hipblasCaxpy(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incX, (hipblasComplex*)Y, incY)); -} - -template <> -void axpy_op, base_device::DEVICE_GPU>::operator()(const int& N, - const std::complex* alpha, - const std::complex* X, - const int& incX, - std::complex* Y, - const 
int& incY) -{ - hipblasErrcheck(hipblasZaxpy(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incX, (hipblasDoubleComplex*)Y, incY)); -} - hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name) { if (trans == 'N') @@ -697,24 +216,6 @@ void gemv_op, base_device::DEVICE_GPU>::operator()(const ch hipblasErrcheck(hipblasZgemv(cublas_handle, cutrans, m, n, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)A, lda, (hipblasDoubleComplex*)X, incx, (hipblasDoubleComplex*)beta, (hipblasDoubleComplex*)Y, incx)); } -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - hipblasErrcheck(hipblasCscal(cublas_handle, N, (hipblasComplex*)alpha, (hipblasComplex*)X, incx)); -} - -template <> -void scal_op::operator()(const int& N, - const std::complex* alpha, - std::complex* X, - const int& incx) -{ - hipblasErrcheck(hipblasZscal(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incx)); -} - template <> void gemm_op::operator()(const char& transa, const char& transb, @@ -939,31 +440,7 @@ void matrixCopy, base_device::DEVICE_GPU>::operator()(const // Explicitly instantiate functors for the types of functor registered. -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct matrixCopy; template struct matrixCopy, base_device::DEVICE_GPU>; - -template struct dot_real_op, base_device::DEVICE_GPU>; -template struct calc_grad_with_block_op, base_device::DEVICE_GPU>; -template struct line_minimize_with_block_op, base_device::DEVICE_GPU>; -template struct vector_div_constant_op, base_device::DEVICE_GPU>; -template struct vector_mul_vector_op; -template struct vector_mul_vector_op, base_device::DEVICE_GPU>; -template struct vector_div_vector_op, base_device::DEVICE_GPU>; -template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixCopy, base_device::DEVICE_GPU>; - -#ifdef __LCAO -template struct dot_real_op; -template struct vector_div_constant_op; -template struct vector_div_vector_op; -template struct matrixCopy; -template struct constantvector_addORsub_constantVector_op; -#endif } // namespace ModuleBase diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu new file mode 100644 index 0000000000..4ba8ae8925 --- /dev/null +++ b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu @@ -0,0 +1,376 @@ +#include "module_base/kernels/math_kernel_op.h" + +#include +#include +template <> +struct GetTypeReal> { + using type = float; /**< The return type specialization for std::complex. */ +}; +template <> +struct GetTypeReal> { + using type = double; /**< The return type specialization for std::complex. 
*/
+};
+namespace ModuleBase
+{
+void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result);
+void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result);
+
+// Define the CUDA kernel:
+template
+__launch_bounds__(1024) __global__ void vector_mul_real_kernel(const int size,
+                                                               T* result,
+                                                               const T* vector,
+                                                               const typename GetTypeReal::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] * constant;
+    }
+}
+
+template
+__launch_bounds__(1024) __global__ void vector_mul_vector_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector1,
+                                                                 const typename GetTypeReal::type* vector2,
+                                                                 const bool add)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
+    }
+}
+
+template
+__launch_bounds__(1024) __global__ void vector_div_vector_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector1,
+                                                                 const typename GetTypeReal::type* vector2)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector1[i] / vector2[i];
+    }
+}
+
+template
+__launch_bounds__(1024) __global__ void constantvector_addORsub_constantVector_kernel(const int size,
+                                                                                      T* result,
+                                                                                      const T* vector1,
+                                                                                      const Real constant1,
+                                                                                      const T* vector2,
+                                                                                      const Real constant2)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+    }
+}
+
+// vector operator: result[i] = vector[i] * constant
+template <>
+void vector_mul_real_op::operator()(const int dim,
+                                    double* result,
+                                    const double* vector,
+                                    const double constant)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
+template
+inline void vector_mul_real_wrapper(const int dim,
+                                    std::complex* result,
+                                    const std::complex* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex* result_tmp = reinterpret_cast*>(result);
+    const thrust::complex* vector_tmp = reinterpret_cast*>(vector);
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result_tmp,
+                       vector_tmp,
+                       constant);
+
+    hipCheckOnDebug();
+}
+template <>
+void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim,
+                                                              std::complex* result,
+                                                              const std::complex* vector,
+                                                              const float constant)
+{
+    vector_mul_real_wrapper(dim, result, vector, constant);
+
+    hipCheckOnDebug();
+}
+template <>
+void vector_mul_real_op, base_device::DEVICE_GPU>::operator()(const int dim,
+                                                              std::complex* result,
+                                                              const std::complex* vector,
+                                                              const double constant)
+{
+    vector_mul_real_wrapper(dim, result, vector, constant);
+
+    hipCheckOnDebug();
+}
+
+// vector operator: result[i] = vector1[i](not complex) * vector2[i](not complex)
+template <>
+void vector_mul_vector_op::operator()(const int& dim,
+                                      double* result,
+                                      const double* vector1,
+                                      const double* vector2,
+                                      const bool& add)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector1,
+                       vector2,
+                       add);
+
+    hipCheckOnDebug();
+}
+
+// 
vector operator: result[i] = vector1[i](complex) * vector2[i](not complex) +template +inline void vector_mul_vector_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2, + const bool& add) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel>), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result_tmp, + vector1_tmp, + vector2, + add); + + hipCheckOnDebug(); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} +template <> +void vector_mul_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2, + const bool& add) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add); +} + +// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) +template <> +void vector_div_vector_op::operator()(const int& dim, + double* result, + const double* vector1, + const double* vector2) +{ + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result, + vector1, + vector2); + + hipCheckOnDebug(); +} +// vector operator: result[i] = vector1[i](complex) / vector2[i](not complex) +template +inline void vector_div_vector_op_complex_wrapper(const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + int thread = 1024; + int block = (dim + thread - 1) / thread; + hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_vector_kernel>), + dim3(block), + dim3(thread), + 0, + 0, + dim, + result_tmp, + vector1_tmp, + vector2); + + hipCheckOnDebug(); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()(const int& dim, + std::complex* result, + const std::complex* vector1, + const float* vector2) +{ + vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); +} +template <> +void vector_div_vector_op, base_device::DEVICE_GPU>::operator()( + const int& dim, + std::complex* result, + const std::complex* vector1, + const double* vector2) +{ + vector_div_vector_op_complex_wrapper(dim, result, vector1, vector2); +} + +// vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2 +template +void constantvector_addORsub_constantVector_op::operator()(const int& dim, + T* result, + const T* vector1, + const Real constant1, + const T* vector2, + const Real constant2) +{ + using Type = typename GetTypeThrust::type; + using Real = typename GetTypeReal::type; + + auto result_tmp = reinterpret_cast(result); + auto vector1_tmp = reinterpret_cast(vector1); + auto vector2_tmp = reinterpret_cast(vector2); + + int thread = 1024; + int block = (dim + thread - 1) / thread; + constantvector_addORsub_constantVector_kernel + <<>>(dim, result_tmp, vector1_tmp, constant1, vector2_tmp, constant2); + + hipCheckOnDebug(); +} + +template <> +double dot_real_op::operator()(const int& dim, + const double* psi_L, + 
const double* psi_R, + const bool reduce) +{ + double result = 0.0; + xdot_wrapper(dim, psi_L, 1, psi_R, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} + +// for this implementation, please check +// https://thrust.github.io/doc/group__transformed__reductions_ga321192d85c5f510e52300ae762c7e995.html denghui modify +// 2022-10-03 Note that ddot_(2*dim,a,1, b,1) = REAL( zdotc_(dim,a,1,b,1) ) GPU specialization of actual +// computation. +template +inline FPTYPE dot_complex_wrapper(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + // denghui modify 2022-10-07 + // Note that ddot_(2*dim,a,1,b,1) = REAL( zdotc_(dim,a,1,b,1) ) + const FPTYPE* pL = reinterpret_cast(psi_L); + const FPTYPE* pR = reinterpret_cast(psi_R); + FPTYPE result = 0.0; + xdot_wrapper(dim * 2, pL, 1, pR, 1, result); + if (reduce) + { + Parallel_Reduce::reduce_pool(result); + } + return result; +} +template <> +float dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} +template <> +double dot_real_op, base_device::DEVICE_GPU>::operator()(const int& dim, + const std::complex* psi_L, + const std::complex* psi_R, + const bool reduce) +{ + return dot_complex_wrapper(dim, psi_L, psi_R, reduce); +} + +// Explicitly instantiate functors for the types of functor registered. +template struct vector_mul_real_op, base_device::DEVICE_GPU>; +template struct vector_mul_real_op; +template struct vector_mul_real_op, base_device::DEVICE_GPU>; + +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_mul_vector_op; +template struct vector_mul_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; +template struct vector_div_vector_op; +template struct vector_div_vector_op, base_device::DEVICE_GPU>; + +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; +template struct constantvector_addORsub_constantVector_op; +template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; + +template struct dot_real_op, base_device::DEVICE_GPU>; +template struct dot_real_op; +template struct dot_real_op, base_device::DEVICE_GPU>; +} // namespace ModuleBase \ No newline at end of file diff --git a/source/module_base/kernels/rocm/math_op.hip.cu b/source/module_base/kernels/rocm/math_ylm_op.hip.cu similarity index 99% rename from source/module_base/kernels/rocm/math_op.hip.cu rename to source/module_base/kernels/rocm/math_ylm_op.hip.cu index 12e43bdde4..8cfdac4a39 100644 --- a/source/module_base/kernels/rocm/math_op.hip.cu +++ b/source/module_base/kernels/rocm/math_ylm_op.hip.cu @@ -1,4 +1,4 @@ -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include #include diff --git a/source/module_base/kernels/test/CMakeLists.txt b/source/module_base/kernels/test/CMakeLists.txt index 1453545d14..c1cbf6771f 100644 --- a/source/module_base/kernels/test/CMakeLists.txt +++ b/source/module_base/kernels/test/CMakeLists.txt @@ -3,5 +3,5 @@ remove_definitions(-D__MPI) AddTest( TARGET Base_Kernels_UTs LIBS parameter ${math_libs} base device - SOURCES math_op_test.cpp math_kernel_test.cpp + 
SOURCES math_ylm_op_test.cpp math_kernel_test.cpp ) diff --git a/source/module_base/kernels/test/math_kernel_test.cpp b/source/module_base/kernels/test/math_kernel_test.cpp index 9490563743..b8cfb68387 100644 --- a/source/module_base/kernels/test/math_kernel_test.cpp +++ b/source/module_base/kernels/test/math_kernel_test.cpp @@ -72,7 +72,7 @@ class TestModuleHsolverMathKernel : public ::testing::Test // haozhihan add // cpu operator - using vector_div_constant_op_cpu = ModuleBase::vector_div_constant_op, base_device::DEVICE_CPU>; + using vector_mul_real_op_cpu = ModuleBase::vector_mul_real_op, base_device::DEVICE_CPU>; using vector_mul_vector_op_cpu = ModuleBase::vector_mul_vector_op, base_device::DEVICE_CPU>; using vector_div_vector_op_cpu = ModuleBase::vector_div_vector_op, base_device::DEVICE_CPU>; using constantvector_addORsub_constantVector_op_cpu @@ -81,7 +81,7 @@ class TestModuleHsolverMathKernel : public ::testing::Test using scal_op_cpu = ModuleBase::scal_op; using gemv_op_cpu = ModuleBase::gemv_op, base_device::DEVICE_CPU>; // gpu operator - using vector_div_constant_op_gpu = ModuleBase::vector_div_constant_op, base_device::DEVICE_GPU>; + using vector_mul_real_op_gpu = ModuleBase::vector_mul_real_op, base_device::DEVICE_GPU>; using vector_mul_vector_op_gpu = ModuleBase::vector_mul_vector_op, base_device::DEVICE_GPU>; using vector_div_vector_op_gpu = ModuleBase::vector_div_vector_op, base_device::DEVICE_GPU>; using constantvector_addORsub_constantVector_op_gpu @@ -117,10 +117,10 @@ class TestModuleHsolverMathKernel : public ::testing::Test {1.41257916e+00, 5.45282609e-01}, {-1.29333636e-01, -5.04228492e-03}}; - // (1) for test vector_div_constant_op + // (1) for test vector_mul_real_op const std::vector> input = L; const double constant = 5.5; - const std::vector> output_vector_div_constant_op = {{-0.11893203, -0.13492526}, + const std::vector> output_vector_mul_real_op = {{-0.11893203, -0.13492526}, {-0.40314756, 0.07734553}, {0.61158728, -0.45754102}, {-0.54274745, -0.09682102}, @@ -261,14 +261,14 @@ TEST_F(TestModuleHsolverMathKernel, zdot_real_op_cpu) EXPECT_LT(fabs(result - expected_result), 1e-12); } -TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_cpu) +TEST_F(TestModuleHsolverMathKernel, vector_mul_real_op_cpu) { std::vector> output(input.size()); - vector_div_constant_op_cpu()(dim, output.data(), input.data(), constant); + vector_mul_real_op_cpu()(dim, output.data(), input.data(), 1.0 / constant); for (int i = 0; i < input.size(); i++) { - EXPECT_LT(fabs(output[i].imag() - output_vector_div_constant_op[i].imag()), 1e-8); - EXPECT_LT(fabs(output[i].real() - output_vector_div_constant_op[i].real()), 1e-8); + EXPECT_LT(fabs(output[i].imag() - output_vector_mul_real_op[i].imag()), 1e-8); + EXPECT_LT(fabs(output[i].real() - output_vector_mul_real_op[i].real()), 1e-8); } } @@ -381,7 +381,7 @@ TEST_F(TestModuleHsolverMathKernel, zdot_real_op_gpu) delete_memory_op()(psi_R_dev); } -TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu) +TEST_F(TestModuleHsolverMathKernel, vector_mul_real_op_gpu) { // in CPU std::vector> output(input.size()); @@ -393,14 +393,14 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu) // syn the input data in CPU to GPU synchronize_memory_op()(input_dev, input.data(), input.size()); // run - vector_div_constant_op_gpu()(dim, output_dev, input_dev, constant); + vector_mul_real_op_gpu()(dim, output_dev, input_dev, 1.0 / constant); // syn the output data in GPU to CPU synchronize_memory_op_gpu()(output.data(), output_dev, 
output.size()); for (int i = 0; i < input.size(); i++) { - EXPECT_LT(fabs(output[i].imag() - output_vector_div_constant_op[i].imag()), 1e-8); - EXPECT_LT(fabs(output[i].real() - output_vector_div_constant_op[i].real()), 1e-8); + EXPECT_LT(fabs(output[i].imag() - output_vector_mul_real_op[i].imag()), 1e-8); + EXPECT_LT(fabs(output[i].real() - output_vector_mul_real_op[i].real()), 1e-8); } delete_memory_op()(input_dev); delete_memory_op()(output_dev); diff --git a/source/module_base/kernels/test/math_op_test.cpp b/source/module_base/kernels/test/math_ylm_op_test.cpp similarity index 99% rename from source/module_base/kernels/test/math_op_test.cpp rename to source/module_base/kernels/test/math_ylm_op_test.cpp index cfdedb234e..5da30d8111 100644 --- a/source/module_base/kernels/test/math_op_test.cpp +++ b/source/module_base/kernels/test/math_ylm_op_test.cpp @@ -1,4 +1,4 @@ -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include "module_base/module_device/memory_op.h" diff --git a/source/module_base/math_ylmreal.cpp b/source/module_base/math_ylmreal.cpp index fac76cf959..69c79dc089 100644 --- a/source/module_base/math_ylmreal.cpp +++ b/source/module_base/math_ylmreal.cpp @@ -1,7 +1,7 @@ #include "math_ylmreal.h" #include "constants.h" -#include "module_base/kernels/math_op.h" +#include "module_base/kernels/math_ylm_op.h" #include "module_base/libm/libm.h" #include "module_base/module_device/memory_op.h" #include "module_base/array_pool.h" diff --git a/source/module_hsolver/diago_bpcg.cpp b/source/module_hsolver/diago_bpcg.cpp index 2a03fe13b9..83914874e5 100644 --- a/source/module_hsolver/diago_bpcg.cpp +++ b/source/module_hsolver/diago_bpcg.cpp @@ -1,18 +1,17 @@ -#include - #include "module_hsolver/diago_bpcg.h" -#include -#include - -#include - #include "diago_iter_assist.h" #include "module_base/blas_connector.h" #include "module_base/global_function.h" #include "module_base/kernels/math_kernel_op.h" +#include "module_hsolver/kernels/bpcg_kernel_op.h" #include "para_linear_transform.h" +#include +#include +#include +#include + namespace hsolver { template @@ -100,7 +99,13 @@ void DiagoBPCG::line_minimize( ct::Tensor& psi_out, ct::Tensor& hpsi_out) { - line_minimize_with_block_op()(grad_in.data(), hgrad_in.data(), psi_out.data(), hpsi_out.data(), this->n_dim, this->n_basis, this->n_band_l); + line_minimize_with_block_op()(grad_in.data(), + hgrad_in.data(), + psi_out.data(), + hpsi_out.data(), + this->n_dim, + this->n_basis, + this->n_band_l); } @@ -138,17 +143,16 @@ void DiagoBPCG::calc_grad_with_block( ct::Tensor& grad_out, ct::Tensor& grad_old_out) { - calc_grad_with_block_op()( - prec_in.data(), - err_out.data(), - beta_out.data(), - psi_in.data(), - hpsi_in.data(), - grad_out.data(), - grad_old_out.data(), - this->n_dim, - this->n_basis, - this->n_band_l); + calc_grad_with_block_op()(prec_in.data(), + err_out.data(), + beta_out.data(), + psi_in.data(), + hpsi_in.data(), + grad_out.data(), + grad_old_out.data(), + this->n_dim, + this->n_basis, + this->n_band_l); } template diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h index 243ac8b44f..27d8e6a0ae 100644 --- a/source/module_hsolver/diago_bpcg.h +++ b/source/module_hsolver/diago_bpcg.h @@ -349,8 +349,6 @@ class DiagoBPCG // note: these operators use template parameter base_device::Device_* // defined in module_base/module_device/types.h // different from ct_Device! 
- using calc_grad_with_block_op = ModuleBase::calc_grad_with_block_op; - using line_minimize_with_block_op = ModuleBase::line_minimize_with_block_op; using gemm_op = ModuleBase::gemm_op; }; diff --git a/source/module_hsolver/diago_cg.cpp b/source/module_hsolver/diago_cg.cpp index 08d5d914ba..9eba7c2bda 100644 --- a/source/module_hsolver/diago_cg.cpp +++ b/source/module_hsolver/diago_cg.cpp @@ -555,7 +555,7 @@ void DiagoCG::schmit_orth(const int& m, const ct::Tensor& psi, const // { // pphi_m[ig] /= psi_norm; // } - ModuleBase::vector_div_constant_op()(this->n_basis_, phi_m.data(), phi_m.data(), psi_norm); + ModuleBase::vector_mul_real_op()(this->n_basis_, phi_m.data(), phi_m.data(), Real(1.0 / psi_norm)); // ModuleBase::timer::tick("DiagoCG","schmit_orth"); } diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index 09ff916917..9e8aa2f066 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -366,20 +366,18 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, } // "normalize!!!" in order to improve numerical stability of subspace diagonalization - std::vector psi_norm(notconv, 0.0); for (size_t i = 0; i < notconv; i++) { - psi_norm[i] = ModuleBase::dot_real_op()(this->dim, + Real psi_norm = ModuleBase::dot_real_op()(this->dim, psi_iter + (nbase + i) * this->dim, psi_iter + (nbase + i) * this->dim, true); - assert(psi_norm[i] > 0.0); - psi_norm[i] = sqrt(psi_norm[i]); - - ModuleBase::vector_div_constant_op()(this->dim, - psi_iter + (nbase + i) * this->dim, - psi_iter + (nbase + i) * this->dim, - psi_norm[i]); + assert(psi_norm > 0.0); + psi_norm = sqrt(psi_norm); + ModuleBase::vector_mul_real_op()(this->dim, + psi_iter + (nbase + i) * this->dim, + psi_iter + (nbase + i) * this->dim, + Real(1.0 / psi_norm)); } // update hpsi[:, nbase:nbase+notconv] diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp index 504c3a2053..59c40c9fa4 100644 --- a/source/module_hsolver/diago_david.cpp +++ b/source/module_hsolver/diago_david.cpp @@ -966,7 +966,7 @@ void DiagoDavid::SchmidtOrth(const int& dim, else { // psi_m = psi_m / psi_norm - ModuleBase::vector_div_constant_op()(dim, psi_m, psi_m, psi_norm); + ModuleBase::vector_mul_real_op()(dim, psi_m, psi_m, Real(1.0 / psi_norm)); // for (int i = 0; i < npw; i++) // { // psi_m[i] /= psi_norm; diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.cpp b/source/module_hsolver/kernels/bpcg_kernel_op.cpp new file mode 100644 index 0000000000..bf05ac94e1 --- /dev/null +++ b/source/module_hsolver/kernels/bpcg_kernel_op.cpp @@ -0,0 +1,113 @@ +#include "module_hsolver/kernels/bpcg_kernel_op.h" +#include "module_base/blas_connector.h" +#include "module_base/kernels/math_kernel_op.h" +#include "module_base/parallel_reduce.h" +namespace hsolver +{ + +template +struct line_minimize_with_block_op +{ + using Real = typename GetTypeReal::type; + void operator()(T* grad_out, + T* hgrad_out, + T* psi_out, + T* hpsi_out, + const int& n_basis, + const int& n_basis_max, + const int& n_band) + { + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; + Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; + auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); + Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + Parallel_Reduce::reduce_pool(norm); + norm = 1.0 / sqrt(norm); + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) + { + auto item 
= band_idx * n_basis_max + basis_idx;
+                grad_out[item] *= norm;
+                hgrad_out[item] *= norm;
+                epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item]));
+                epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
+                epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
+            }
+            Parallel_Reduce::reduce_pool(epsilo_0);
+            Parallel_Reduce::reduce_pool(epsilo_1);
+            Parallel_Reduce::reduce_pool(epsilo_2);
+            theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
+            cos_theta = std::cos(theta);
+            sin_theta = std::sin(theta);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                psi_out[item] = psi_out[item] * cos_theta + grad_out[item] * sin_theta;
+                hpsi_out[item] = hpsi_out[item] * cos_theta + hgrad_out[item] * sin_theta;
+            }
+        }
+    }
+};
+
+template
+struct calc_grad_with_block_op
+{
+    using Real = typename GetTypeReal::type;
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band)
+    {
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real err = 0.0;
+            Real beta = 0.0;
+            Real epsilo = 0.0;
+            Real grad_2 = {0.0};
+            T grad_1 = {0.0, 0.0};
+            auto A = reinterpret_cast(psi_out + band_idx * n_basis_max);
+            Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+            Parallel_Reduce::reduce_pool(norm);
+            norm = 1.0 / sqrt(norm);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                psi_out[item] *= norm;
+                hpsi_out[item] *= norm;
+                epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
+            }
+            Parallel_Reduce::reduce_pool(epsilo);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                grad_2 = std::norm(grad_1);
+                err += grad_2;
+                beta += grad_2 / prec_in[basis_idx]; /// Note: check whether dividing by prec is intended here.
+            }
+            Parallel_Reduce::reduce_pool(err);
+            Parallel_Reduce::reduce_pool(beta);
+            for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
+            {
+                auto item = band_idx * n_basis_max + basis_idx;
+                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
+            }
+            beta_out[band_idx] = beta;
+            err_out[band_idx] = sqrt(err);
+        }
+    }
+};
+
+template struct calc_grad_with_block_op, base_device::DEVICE_CPU>;
+template struct line_minimize_with_block_op, base_device::DEVICE_CPU>;
+template struct calc_grad_with_block_op, base_device::DEVICE_CPU>;
+template struct line_minimize_with_block_op, base_device::DEVICE_CPU>;
+} // namespace hsolver
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.h b/source/module_hsolver/kernels/bpcg_kernel_op.h
new file mode 100644
index 0000000000..fcf8f99267
--- /dev/null
+++ b/source/module_hsolver/kernels/bpcg_kernel_op.h
@@ -0,0 +1,84 @@
+#ifndef MODULE_HSOLVER_BPCG_KERNEL_H
+#define MODULE_HSOLVER_BPCG_KERNEL_H
+#include "module_base/macros.h"
+#include "module_base/module_device/types.h"
+namespace hsolver
+{
+
+template
+struct line_minimize_with_block_op
+{
+    /// @brief For each band, normalize the gradient and rotate psi and hpsi
+    /// along the search direction: psi = psi * cos(theta) + grad * sin(theta),
+    /// where theta is obtained from <psi|hpsi>, <grad|hpsi> and <grad|hgrad>.
+    /// MPI communication may occur when the plane-wave parallelization
+    /// strategy is enabled.
+    ///
+    /// Input Parameters
+    /// \param grad_out : gradient array (n_band x n_basis_max)
+    /// \param hgrad_out : H applied to the gradient
+    /// \param psi_out : wave functions
+    /// \param hpsi_out : H applied to the wave functions
+    /// \param n_basis : number of basis functions actually used
+    /// \param n_basis_max : leading dimension (maximum number of basis functions)
+    /// \param n_band : number of bands
+    ///
+    /// Output Parameters
+    /// \param psi_out, hpsi_out : updated in place along the search direction
+    void operator()(T* grad_out,
+                    T* hgrad_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
+};
+
+template
+struct calc_grad_with_block_op
+{
+    /// @brief For each band, normalize psi and hpsi, compute the band energy
+    /// epsilo = <psi|hpsi>, and build the new preconditioned gradient from the
+    /// residual hpsi - epsilo * psi and the previous gradient. The per-band
+    /// error and beta are returned as well. MPI communication may occur when
+    /// the plane-wave parallelization strategy is enabled.
+    ///
+    /// Input Parameters
+    /// \param prec_in : preconditioner array
+    /// \param psi_out : wave functions
+    /// \param hpsi_out : H applied to the wave functions
+    /// \param grad_old_out : gradient of the previous step
+    /// \param n_basis : number of basis functions actually used
+    /// \param n_basis_max : leading dimension (maximum number of basis functions)
+    /// \param n_band : number of bands
+    ///
+    /// Output Parameters
+    /// \param err_out : per-band residual norm
+    /// \param beta_out : per-band beta used to mix in the old gradient
+    /// \param grad_out : new preconditioned gradient
+    using Real = typename GetTypeReal::type;
+    void operator()(const Real* prec_in,
+                    Real* err_out,
+                    Real* beta_out,
+                    T* psi_out,
+                    T* hpsi_out,
+                    T* grad_out,
+                    T* grad_old_out,
+                    const int& n_basis,
+                    const int& n_basis_max,
+                    const int& n_band);
+};
+
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+template
+struct line_minimize_with_block_op {
+    using Real = typename GetTypeReal::type;
+    void operator()(T *grad_out, T *hgrad_out, T *psi_out, T *hpsi_out,
+                    const int &n_basis, const int &n_basis_max,
+                    const int &n_band);
+};
+
+template
+struct calc_grad_with_block_op {
+    using Real = typename GetTypeReal::type;
+    void operator()(const Real *prec_in, Real *err_out, Real *beta_out,
+                    T *psi_out, T *hpsi_out, T *grad_out, T *grad_old_out,
+                    const int &n_basis, const int &n_basis_max,
+                    const int &n_band);
+};
+
+#endif
+} // namespace hsolver
+
+#endif
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
new file mode 100644
index 0000000000..d851265a1b
--- /dev/null
+++ b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
@@ -0,0 +1,300 @@
+#include "module_base/kernels/math_kernel_op.h"
+#include "module_hsolver/kernels/bpcg_kernel_op.h"
+
+#include
+#include
+namespace hsolver
+{
+const int warp_size = 32;
+const int thread_per_block = 256;
+
+template
+__global__ void line_minimize_with_block(
+    thrust::complex* grad,
+    thrust::complex* hgrad,
+    thrust::complex* psi,
+    thrust::complex* hpsi,
+    const int n_basis,
+    const int n_basis_max)
+{
+    int band_idx = blockIdx.x; // band_idx
+    int tid = threadIdx.x; // basis_idx
+    int item = 0;
+    Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
+    Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
+    __shared__ Real data[thread_per_block * 3];
+
+    data[tid] = 0;
+
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) {
+        item = band_idx * n_basis_max + basis_idx;
+        data[tid] += (grad[item] * thrust::conj(grad[item])).real();
+    }
+    __syncthreads();
+    // just do some parallel reduction in shared memory
+    for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+
+    // For threads in the same warp, it is better that they process the same work
+    // Also, __syncwarp() should be used instead of __syncthreads()
+    // Therefore we unroll the loop and ensure that the threads do the same work
+    if (tid < warp_size) 
{ + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + } + + __syncthreads(); + + Real norm = 1.0 / sqrt(data[0]); + __syncthreads(); + + data[tid] = 0; + data[thread_per_block + tid] = 0; + data[2 * thread_per_block + tid] = 0; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + grad[item] *= norm; + hgrad[item] *= norm; + data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); + data[thread_per_block + tid] += (grad[item] * thrust::conj(hpsi[item])).real(); + data[2 * thread_per_block + tid] += (grad[item] * thrust::conj(hgrad[item])).real(); + } + __syncthreads(); + + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + data[thread_per_block + tid] += data[thread_per_block + tid + ii]; + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + ii]; + } + __syncthreads(); + } + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 32]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 16]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 8]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 4]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 2]; __syncwarp(); + data[2 * thread_per_block + tid] += data[2 * thread_per_block + tid + 1]; __syncwarp(); + } + __syncthreads(); + epsilo_0 = data[0]; + epsilo_1 = data[thread_per_block]; + epsilo_2 = data[2 * thread_per_block]; + + theta = 0.5 * abs(atan(2 * epsilo_1/(epsilo_0 - epsilo_2))); + cos_theta = cos(theta); + sin_theta = sin(theta); + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + psi [item] = psi [item] * cos_theta + grad [item] * sin_theta; + hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta; + } +} + +template +__global__ void calc_grad_with_block( + const Real* prec, + Real* err, + Real* beta, + thrust::complex* psi, + thrust::complex* hpsi, + thrust::complex* grad, + thrust::complex* grad_old, + const int n_basis, + const int n_basis_max) +{ + int band_idx = blockIdx.x; // band_idx + int tid = threadIdx.x; // basis_idx + int item = 0; + Real err_st = 0.0; + Real beta_st = 0.0; + Real epsilo = 0.0; + Real grad_2 = 0.0; + thrust::complex grad_1 = {0, 0}; + __shared__ Real data[thread_per_block * 
2]; + + // Init shared memory + data[tid] = 0; + + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + data[tid] += (psi[item] * thrust::conj(psi[item])).real(); + } + __syncthreads(); + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + } + + __syncthreads(); + + Real norm = 1.0 / sqrt(data[0]); + __syncthreads(); + + data[tid] = 0; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + psi[item] *= norm; + hpsi[item] *= norm; + data[tid] += (hpsi[item] * thrust::conj(psi[item])).real(); + } + __syncthreads(); + + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + } + + __syncthreads(); + epsilo = data[0]; + __syncthreads(); + + data[tid] = 0; + data[thread_per_block + tid] = 0; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + grad_1 = hpsi[item] - epsilo * psi[item]; + grad_2 = thrust::norm(grad_1); + data[tid] += grad_2; + data[thread_per_block + tid] += grad_2 / prec[basis_idx]; + } + __syncthreads(); + + // just do some parallel reduction in shared memory + for (int ii = thread_per_block >> 1; ii > warp_size; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + data[thread_per_block + tid] += data[thread_per_block + tid + ii]; + } + __syncthreads(); + } + + if (tid < warp_size) { + data[tid] += data[tid + 32]; __syncwarp(); + data[tid] += data[tid + 16]; __syncwarp(); + data[tid] += data[tid + 8]; __syncwarp(); + data[tid] += data[tid + 4]; __syncwarp(); + data[tid] += data[tid + 2]; __syncwarp(); + data[tid] += data[tid + 1]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 32]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 16]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 8]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 4]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 2]; __syncwarp(); + data[thread_per_block + tid] += data[thread_per_block + tid + 1]; __syncwarp(); + } + + __syncthreads(); + err_st = data[0]; + beta_st = data[thread_per_block]; + for (int basis_idx = tid; basis_idx < n_basis; basis_idx += thread_per_block) { + item = band_idx * n_basis_max + basis_idx; + grad_1 = hpsi[item] - epsilo * psi[item]; + grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item]; + } + + __syncthreads(); + if (tid == 0) { + beta[band_idx] = beta_st; + err[band_idx] = sqrt(err_st); + } +} + +template +void 
+template <typename T>
+void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
+                                                                          T* hgrad_out,
+                                                                          T* psi_out,
+                                                                          T* hpsi_out,
+                                                                          const int& n_basis,
+                                                                          const int& n_basis_max,
+                                                                          const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hgrad_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+
+    line_minimize_with_block<Real><<<n_band, thread_per_block>>>(
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    cudaCheckOnDebug();
+}
+
+template <typename T>
+void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real* prec_in,
+                                                                      Real* err_out,
+                                                                      Real* beta_out,
+                                                                      T* psi_out,
+                                                                      T* hpsi_out,
+                                                                      T* grad_out,
+                                                                      T* grad_old_out,
+                                                                      const int& n_basis,
+                                                                      const int& n_basis_max,
+                                                                      const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(grad_old_out);
+
+    calc_grad_with_block<Real><<<n_band, thread_per_block>>>(
+        prec_in, err_out, beta_out,
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    cudaCheckOnDebug();
+}
+
+template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+}
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
new file mode 100644
index 0000000000..850b48bf2e
--- /dev/null
+++ b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
@@ -0,0 +1,224 @@
+#include "module_base/kernels/math_kernel_op.h"
+#include "module_hsolver/kernels/bpcg_kernel_op.h"
+
+#include <hip/hip_runtime.h>
+#include <thrust/complex.h>
+#define WARP_SIZE 32
+#define THREAD_PER_BLOCK 256
+namespace hsolver
+{
+template <typename Real>
+__global__ void line_minimize_with_block(
+    thrust::complex<Real>* grad,
+    thrust::complex<Real>* hgrad,
+    thrust::complex<Real>* psi,
+    thrust::complex<Real>* hpsi,
+    const int n_basis,
+    const int n_basis_max)
+{
+    int band_idx = blockIdx.x; // band_idx
+    int tid = threadIdx.x;     // basis_idx
+    int item = 0;
+    Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
+    Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
+    __shared__ Real data[THREAD_PER_BLOCK * 3];
+
+    data[tid] = 0;
+
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        data[tid] += (grad[item] * thrust::conj(grad[item])).real();
+    }
+    __syncthreads();
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+
+    Real norm = 1.0 / sqrt(data[0]);
+    __syncthreads();
+
+    data[tid] = 0;
+    data[THREAD_PER_BLOCK + tid] = 0;
+    data[2 * THREAD_PER_BLOCK + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        grad[item] *= norm;
+        hgrad[item] *= norm;
+        data[tid] += (hpsi[item] * thrust::conj(psi[item])).real();
+        data[THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hpsi[item])).real();
+        data[2 * THREAD_PER_BLOCK + tid] += (grad[item] * thrust::conj(hgrad[item])).real();
+    }
+    __syncthreads();
+
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
+            data[2 * THREAD_PER_BLOCK + tid] += data[2 * THREAD_PER_BLOCK + tid + ii];
+        }
+        __syncthreads();
+    }
+
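+    // data[0], data[THREAD_PER_BLOCK] and data[2 * THREAD_PER_BLOCK] now hold
+    // <psi|H|psi>, <grad|H|psi> and <grad|H|grad>, which fix the rotation angle theta.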
+    epsilo_0 = data[0];
+    epsilo_1 = data[THREAD_PER_BLOCK];
+    epsilo_2 = data[2 * THREAD_PER_BLOCK];
+
+    theta = 0.5 * abs(atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
+    cos_theta = cos(theta);
+    sin_theta = sin(theta);
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        psi [item] = psi [item] * cos_theta + grad [item] * sin_theta;
+        hpsi[item] = hpsi[item] * cos_theta + hgrad[item] * sin_theta;
+    }
+}
+
+template <typename Real>
+__global__ void calc_grad_with_block(
+    const Real* prec,
+    Real* err,
+    Real* beta,
+    thrust::complex<Real>* psi,
+    thrust::complex<Real>* hpsi,
+    thrust::complex<Real>* grad,
+    thrust::complex<Real>* grad_old,
+    const int n_basis,
+    const int n_basis_max)
+{
+    int band_idx = blockIdx.x; // band_idx
+    int tid = threadIdx.x;     // basis_idx
+    int item = 0;
+    Real err_st = 0.0;
+    Real beta_st = 0.0;
+    Real epsilo = 0.0;
+    Real grad_2 = 0.0;
+    thrust::complex<Real> grad_1 = {0, 0};
+    __shared__ Real data[THREAD_PER_BLOCK * 2];
+
+    // Init shared memory
+    data[tid] = 0;
+
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        data[tid] += (psi[item] * thrust::conj(psi[item])).real();
+    }
+    __syncthreads();
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+
+    Real norm = 1.0 / sqrt(data[0]);
+    __syncthreads();
+
+    data[tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        psi[item] *= norm;
+        hpsi[item] *= norm;
+        data[tid] += (hpsi[item] * thrust::conj(psi[item])).real();
+    }
+    __syncthreads();
+
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+        }
+        __syncthreads();
+    }
+    epsilo = data[0];
+    __syncthreads();
+
+    data[tid] = 0;
+    data[THREAD_PER_BLOCK + tid] = 0;
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        grad_1 = hpsi[item] - epsilo * psi[item];
+        grad_2 = thrust::norm(grad_1);
+        data[tid] += grad_2;
+        data[THREAD_PER_BLOCK + tid] += grad_2 / prec[basis_idx];
+    }
+    __syncthreads();
+
+    // just do some parallel reduction in shared memory
+    for (int ii = THREAD_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
+        if (tid < ii) {
+            data[tid] += data[tid + ii];
+            data[THREAD_PER_BLOCK + tid] += data[THREAD_PER_BLOCK + tid + ii];
+        }
+        __syncthreads();
+    }
+    err_st = data[0];
+    beta_st = data[THREAD_PER_BLOCK];
+    for (int basis_idx = tid; basis_idx < n_basis; basis_idx += THREAD_PER_BLOCK) {
+        item = band_idx * n_basis_max + basis_idx;
+        grad_1 = hpsi[item] - epsilo * psi[item];
+        grad[item] = -grad_1 / prec[basis_idx] + beta_st / beta[band_idx] * grad_old[item];
+    }
+
+    __syncthreads();
+    if (tid == 0) {
+        beta[band_idx] = beta_st;
+        err[band_idx] = sqrt(err_st);
+    }
+}
+
+template <typename T>
+void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
+                                                                          T* hgrad_out,
+                                                                          T* psi_out,
+                                                                          T* hpsi_out,
+                                                                          const int& n_basis,
+                                                                          const int& n_basis_max,
+                                                                          const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hgrad_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+
+    line_minimize_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    hipCheckOnDebug();
+}
+
+template <typename T>
+void calc_grad_with_block_op<T, base_device::DEVICE_GPU>::operator()(const Real* prec_in,
+                                                                      Real* err_out,
+                                                                      Real* beta_out,
+                                                                      T* psi_out,
+                                                                      T* hpsi_out,
+                                                                      T* grad_out,
+                                                                      T* grad_old_out,
+                                                                      const int& n_basis,
+                                                                      const int& n_basis_max,
+                                                                      const int& n_band)
+{
+    auto A = reinterpret_cast<thrust::complex<Real>*>(psi_out);
+    auto B = reinterpret_cast<thrust::complex<Real>*>(hpsi_out);
+    auto C = reinterpret_cast<thrust::complex<Real>*>(grad_out);
+    auto D = reinterpret_cast<thrust::complex<Real>*>(grad_old_out);
+
+    calc_grad_with_block<Real><<<n_band, THREAD_PER_BLOCK>>>(
+        prec_in, err_out, beta_out,
+        A, B, C, D,
+        n_basis, n_basis_max);
+
+    hipCheckOnDebug();
+}
+
+template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
+}
\ No newline at end of file
diff --git a/source/module_hsolver/kernels/test/perf_math_kernel.cpp b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
index bb69ca7ac7..f9f0228c1f 100644
--- a/source/module_hsolver/kernels/test/perf_math_kernel.cpp
+++ b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
@@ -17,7 +17,7 @@
 /**
 * Tested function:
 * - zdot_real_cpu_op
-* - vector_div_constant_op_cpu
+* - vector_mul_real_op_cpu
 * - vector_mul_vector_op_cpu
 * - vector_div_vector_op_cpu
 * - constantvector_addORsub_constantVector_op_cpu
@@ -25,7 +25,7 @@
 * - scal_cpu
 *
 * - zdot_real_gpu_op
-* - vector_div_constant_op_gpu
+* - vector_mul_real_op_gpu
 * - vector_mul_vector_op_gpu
 * - vector_div_vector_op_gpu
 * - constantvector_addORsub_constantVector_op_gpu
@@ -134,7 +134,7 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
     // CPU operator
     using zdot_real_cpu_op = ModuleBase::dot_real_op<std::complex<double>, base_device::DEVICE_CPU>;
-    using vector_div_constant_op_cpu = ModuleBase::vector_div_constant_op<std::complex<double>, base_device::DEVICE_CPU>;
+    using vector_mul_real_op_cpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_CPU>;
     using vector_mul_vector_op_cpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
     using vector_div_vector_op_cpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
     using constantvector_addORsub_constantVector_op_cpu
@@ -148,7 +148,7 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
     // GPU operator
     using zdot_real_gpu_op = ModuleBase::dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
-    using vector_div_constant_op_gpu = ModuleBase::vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+    using vector_mul_real_op_gpu = ModuleBase::vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
     using vector_mul_vector_op_gpu = ModuleBase::vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
     using vector_div_vector_op_gpu = ModuleBase::vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
     using constantvector_addORsub_constantVector_op_gpu
@@ -167,9 +167,9 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_cpu_op)(benchmark::
 }
 
-BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_cpu)(benchmark::State& state) {
+BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_cpu)(benchmark::State& state) {
     for (auto _ : state) {
-        vector_div_constant_op_cpu()(dim_vector, result_zvector, test_zvector_a, dconstant_a);
+        vector_mul_real_op_cpu()(dim_vector, result_zvector, test_zvector_a, dconstant_a);
     }
 }
@@ -205,7 +205,7 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_scal_op_cpu)(benchmark::State
 
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_cpu_op)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
-BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
+BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_cpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
@@ -236,9 +236,9 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)(benchmark::
     }
 }
 
-BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_gpu)(benchmark::State& state) {
+BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_gpu)(benchmark::State& state) {
     for (auto _ : state) {
-        vector_div_constant_op_gpu()(dim_vector, result_zvector_gpu, test_zvector_a_gpu, dconstant_a);
+        vector_mul_real_op_gpu()(dim_vector, result_zvector_gpu, test_zvector_a_gpu, dconstant_a);
     }
 }
@@ -276,7 +276,7 @@ BENCHMARK_DEFINE_F(PerfModuleHsolverMathKernel, BM_scal_op_gpu)(benchmark::State
 // BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)->RangeMultiplier(10)->Range(1,10e6)->UseManualTime()->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_zdot_real_gpu_op)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
-BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_constant_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
+BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_real_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_mul_vector_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_vector_div_vector_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
 BENCHMARK_REGISTER_F(PerfModuleHsolverMathKernel, BM_constantvector_addORsub_constantVector_op_gpu)->RangeMultiplier(10)->Range(1,10e6)->Unit(benchmark::kMicrosecond);
diff --git a/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref b/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref
index 6896763a6c..f53a4b9f43 100644
--- a/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref
+++ b/tests/integrate/101_PW_upf201_uspp_NaCl/result.ref
@@ -1,7 +1,7 @@
 etotref -1675.0621003323110472
 etotperatomref -837.5310501662
-totalforceref 9.445612
-totalstressref 1646.076135
+totalforceref 9.443968
+totalstressref 1645.827919
 pointgroupref C_2v
 spacegroupref C_2v
 nksibzref 5
diff --git a/tests/integrate/102_PW_PINT_UKS/result.ref b/tests/integrate/102_PW_PINT_UKS/result.ref
index 476cf3e367..9db5bb132a 100644
--- a/tests/integrate/102_PW_PINT_UKS/result.ref
+++ b/tests/integrate/102_PW_PINT_UKS/result.ref
@@ -1,3 +1,3 @@
-etotref -6147.553111898429
-etotperatomref -3073.7765559492
+etotref -6147.5531114
+etotperatomref -3073.7765557
 totaltimeref 14.32
diff --git a/tests/integrate/107_PW_outWfcR/result.ref b/tests/integrate/107_PW_outWfcR/result.ref
index 1880e707f7..c610d46c6f 100644
--- a/tests/integrate/107_PW_outWfcR/result.ref
+++ b/tests/integrate/107_PW_outWfcR/result.ref
@@ -1,8 +1,8 @@
 etotref -197.1405644417868
 etotperatomref -98.5702822209
 variance_wfc_r_0_0 0.31340
-variance_wfc_r_0_1 1.71055
-variance_wfc_r_0_2 2.39603
+variance_wfc_r_0_1 1.71056
+variance_wfc_r_0_2 2.39604
 variance_wfc_r_0_3 1.66607
 variance_wfc_r_0_4 1.05190
 variance_wfc_r_0_5 1.29386