fix compile

Qianruipku · Qianruipku · commit 767e4d471d0c · 2025-03-18T17:04:11.000+08:00
diff --git a/python/pyabacus/CONTRIBUTING.md b/python/pyabacus/CONTRIBUTING.md
@@ -190,6 +190,7 @@ list(APPEND _diago
     ${HSOLVER_PATH}/diag_const_nums.cpp
     ${HSOLVER_PATH}/diago_iter_assist.cpp
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
+    ${HSOLVER_PATH}/kernels/bpcg_kernel_op.cpp
     ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/kernels/math_kernel_op_vec.cpp
     ${BASE_PATH}/kernels/math_ylm_op.cpp
diff --git a/python/pyabacus/src/hsolver/CMakeLists.txt b/python/pyabacus/src/hsolver/CMakeLists.txt
@@ -10,6 +10,7 @@ list(APPEND _diago
 
 
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
+    ${HSOLVER_PATH}/kernels/bpcg_kernel_op.cpp
     # dependency
     ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/kernels/math_kernel_op_vec.cpp
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
@@ -36,6 +36,7 @@ list(APPEND device_srcs
   module_hamilt_pw/hamilt_stodft/kernels/hpsi_norm_op.cpp
   module_basis/module_pw/kernels/pw_op.cpp
   module_hsolver/kernels/dngvd_op.cpp
+  module_hsolver/kernels/bpcg_kernel_op.cpp
   module_elecstate/kernels/elecstate_op.cpp
 
   # module_psi/kernels/psi_memory_op.cpp
@@ -65,6 +66,7 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu
     module_basis/module_pw/kernels/cuda/pw_op.cu
     module_hsolver/kernels/cuda/dngvd_op.cu
+    module_hsolver/kernels/cuda/bpcg_kernel_op.cu
     module_elecstate/kernels/cuda/elecstate_op.cu
 
     # module_psi/kernels/cuda/memory_op.cu
@@ -91,6 +93,7 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu
     module_basis/module_pw/kernels/rocm/pw_op.hip.cu
     module_hsolver/kernels/rocm/dngvd_op.hip.cu
+    module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
     module_elecstate/kernels/rocm/elecstate_op.hip.cu
 
     # module_psi/kernels/rocm/memory_op.hip.cu
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
@@ -350,6 +350,7 @@ OBJS_HSOLVER=diago_cg.o\
     hsolver_pw_sdft.o\
     diago_iter_assist.o\
     dngvd_op.o\
+    bpcg_kernel_op.o\
     diag_const_nums.o\
     diag_hs_para.o\
     diago_pxxxgvx.o\
diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -16,14 +16,6 @@ const int warp_size = 32;
 const int thread_per_block = 256;
 }
 
-template <>
-struct GetTypeReal<thrust::complex<float>> {
-    using type = float; /**< The return type specialization for std::complex<double>. */
-};
-template <>
-struct GetTypeReal<thrust::complex<double>> {
-    using type = double; /**< The return type specialization for std::complex<double>. */
-};
 namespace ModuleBase {
 template <typename T>
 struct GetTypeThrust {
@@ -42,16 +34,6 @@ struct GetTypeThrust<std::complex<double>> {
 
 static cublasHandle_t cublas_handle = nullptr;
 
-static inline
-void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
-    cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result));
-}
-
-static inline
-void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) {
-    cublasErrcheck(cublasDdot(cublas_handle, n, x, incx, y, incy, &result));
-}
-
 void createGpuBlasHandle(){
     if (cublas_handle == nullptr) {
         cublasErrcheck(cublasCreate(&cublas_handle));
diff --git a/source/module_base/kernels/cuda/math_kernel_op_vec.cu b/source/module_base/kernels/cuda/math_kernel_op_vec.cu
@@ -1,16 +1,37 @@
 #include "module_base/kernels/math_kernel_op.h"
 
+#include <base/macros/macros.h>
 #include <thrust/complex.h>
 
+template <>
+struct GetTypeReal<thrust::complex<float>> {
+    using type = float; /**< Real scalar type for the thrust::complex<float> specialization. */
+};
+template <>
+struct GetTypeReal<thrust::complex<double>> {
+    using type = double; /**< Real scalar type for the thrust::complex<double> specialization. */
+};
 namespace ModuleBase
 {
+const int thread_per_block = 256; // threads per block used by the kernel launches in this file
+static cublasHandle_t cublas_handle = nullptr; // NOTE(review): internal linkage, so this is a separate object from the handle createGpuBlasHandle() sets in math_kernel_op.cu; no initialization is visible here - confirm it is created before xdot_wrapper is called
+
+static inline // float overload: dot product of x and y (strides incx, incy) over n elements, written to `result`
+void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
+    cublasErrcheck(cublasSdot(cublas_handle, n, x, incx, y, incy, &result)); // forwards to cuBLAS using the file-local cublas_handle
+}
+
+static inline // double overload: dot product of x and y (strides incx, incy) over n elements, written to `result`
+void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) {
+    cublasErrcheck(cublasDdot(cublas_handle, n, x, incx, y, incy, &result)); // forwards to cuBLAS using the file-local cublas_handle
+}
 
 // Define the CUDA kernel:
-template <typename FPTYPE>
+template <typename T>
 __global__ void vector_mul_real_kernel(const int size,
-                                       thrust::complex<FPTYPE>* result,
-                                       const thrust::complex<FPTYPE>* vector,
-                                       const FPTYPE constant)
+                                       T* result,
+                                       const T* vector,
+                                       const typename GetTypeReal<T>::type constant)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
@@ -87,6 +108,20 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const int& N,
 }
 
 // vector operator: result[i] = vector[i] * constant
+template <>
+void vector_mul_real_op<double, base_device::DEVICE_GPU>::operator()(const int dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
+    int thread = thread_per_block; // 256, declared at the top of namespace ModuleBase in this file
+    int block = (dim + thread - 1) / thread; // ceil(dim / thread) blocks, so each of the dim elements gets a thread
+    vector_mul_real_kernel<double><<<block, thread>>>(dim, result, vector, constant); // real-valued (T = double) instantiation of the kernel
+
+    cudaCheckOnDebug(); // NOTE(review): presumably reports launch errors only in debug builds - confirm against the macro definition
+}
+
 template <typename FPTYPE>
 inline void vector_mul_real_wrapper(const int dim,
                                     std::complex<FPTYPE>* result,
@@ -98,7 +133,7 @@ inline void vector_mul_real_wrapper(const int dim,
 
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_real_kernel<FPTYPE><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
+    vector_mul_real_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
 
     cudaCheckOnDebug();
 }
@@ -326,4 +361,25 @@ double dot_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(co
     return dot_complex_wrapper(dim, psi_L, psi_R, reduce);
 }
 
+// Explicitly instantiate functors for the types of functor registered.
+template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
+template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct dot_real_op<double, base_device::DEVICE_GPU>;
+template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 } // namespace ModuleBase
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
@@ -66,9 +66,10 @@ template <typename FPTYPE, typename Device> struct scal_op {
                   const int &incx);
 };
 
-// vector operator: result[i] = vector[i] * constant
-template <typename FPTYPE, typename Device> struct vector_mul_real_op {
-  /// @brief result[i] = vector[i] * constant, where vector is complex number and constant is real number
+template <typename T, typename Device> struct vector_mul_real_op {
+  using Real = typename GetTypeReal<T>::type;
+  /// @brief result[i] = vector[i] * constant, where vector holds elements of type T (real or complex) and constant is a real number.
+  ///        It is different from scal_op, which is used to multiply a complex number by a complex number.
   ///
   /// Input Parameters
   /// \param dim : array size
@@ -78,8 +79,7 @@ template <typename FPTYPE, typename Device> struct vector_mul_real_op {
   /// Output Parameters
   /// \param result : output array
   /// \note Use multiplication instead of division. It is faster.
-  void operator()(const int dim, std::complex<FPTYPE> *result, const std::complex<FPTYPE> *vector,
-                  const FPTYPE constant);
+  void operator()(const int dim, T* result, const T* vector, const Real constant);
 };
 
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
@@ -293,13 +293,11 @@ template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
 };
 
 // vector operator: result[i] = vector[i] * constant
-template <typename FPTYPE>
-struct vector_mul_real_op<FPTYPE, base_device::DEVICE_GPU>
+template <typename T>
+struct vector_mul_real_op<T, base_device::DEVICE_GPU>
 {
-    void operator()(const int dim,
-                    std::complex<FPTYPE>* result,
-                    const std::complex<FPTYPE>* vector,
-                    const FPTYPE constant);
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int dim, T* result, const T* vector, const Real constant);
 };
 
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
diff --git a/source/module_base/kernels/math_kernel_op_vec.cpp b/source/module_base/kernels/math_kernel_op_vec.cpp
@@ -15,13 +15,11 @@ struct scal_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
-template <typename FPTYPE>
-struct vector_mul_real_op<FPTYPE, base_device::DEVICE_CPU>
+template <typename T>
+struct vector_mul_real_op<T, base_device::DEVICE_CPU>
 {
-    void operator()(const int dim,
-                    std::complex<FPTYPE>* result,
-                    const std::complex<FPTYPE>* vector,
-                    const FPTYPE constant)
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int dim, T* result, const T* vector, const Real constant)
     {
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 4096 / sizeof(Real))
@@ -153,8 +151,9 @@ struct dot_real_op<std::complex<FPTYPE>, base_device::DEVICE_CPU>
 template struct scal_op<float, base_device::DEVICE_CPU>;
 template struct scal_op<double, base_device::DEVICE_CPU>;
 
-template struct vector_mul_real_op<float, base_device::DEVICE_CPU>;
+template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_mul_real_op<double, base_device::DEVICE_CPU>;
+template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_CPU>;
 
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -39,16 +39,6 @@ struct GetTypeThrust<std::complex<double>> {
 
 static hipblasHandle_t cublas_handle = nullptr;
 
-static inline
-void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
-    hipblasErrcheck(hipblasSdot(cublas_handle, n, x, incx, y, incy, &result));
-}
-
-static inline
-void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) {
-    hipblasErrcheck(hipblasDdot(cublas_handle, n, x, incx, y, incy, &result));
-}
-
 void createGpuBlasHandle(){
     if (cublas_handle == nullptr) {
         hipblasErrcheck(hipblasCreate(&cublas_handle));
diff --git a/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu b/source/module_base/kernels/rocm/math_kernel_op_vec.hip.cu
@@ -1,15 +1,35 @@
 #include "module_base/kernels/math_kernel_op.h"
 
+#include <base/macros/macros.h>
 #include <thrust/complex.h>
-
+template <>
+struct GetTypeReal<thrust::complex<float>> {
+    using type = float; /**< Real scalar type for the thrust::complex<float> specialization. */
+};
+template <>
+struct GetTypeReal<thrust::complex<double>> {
+    using type = double; /**< Real scalar type for the thrust::complex<double> specialization. */
+};
 namespace ModuleBase
 {
+
+static hipblasHandle_t cublas_handle = nullptr; // NOTE(review): internal linkage, so this is a separate object from the handle createGpuBlasHandle() sets in math_kernel_op.hip.cu; no initialization is visible here - confirm it is created before xdot_wrapper is called
+static inline // float overload: dot product of x and y (strides incx, incy) over n elements, written to `result`
+void xdot_wrapper(const int &n, const float * x, const int &incx, const float * y, const int &incy, float &result) {
+    hipblasErrcheck(hipblasSdot(cublas_handle, n, x, incx, y, incy, &result)); // forwards to hipBLAS using the file-local cublas_handle
+}
+
+static inline // double overload: dot product of x and y (strides incx, incy) over n elements, written to `result`
+void xdot_wrapper(const int &n, const double * x, const int &incx, const double * y, const int &incy, double &result) {
+    hipblasErrcheck(hipblasDdot(cublas_handle, n, x, incx, y, incy, &result)); // forwards to hipBLAS using the file-local cublas_handle
+}
+
 // Define the CUDA kernel:
-template <typename FPTYPE>
+template <typename T>
 __launch_bounds__(1024) __global__ void vector_mul_real_kernel(const int size,
-                                                               thrust::complex<FPTYPE>* result,
-                                                               const thrust::complex<FPTYPE>* vector,
-                                                               FPTYPE constant)
+                                                               T* result,
+                                                               const T* vector,
+                                                               const typename GetTypeReal<T>::type constant)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
@@ -86,6 +106,26 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const int& N,
 }
 
 // vector operator: result[i] = vector[i] * constant
+template <>
+void vector_mul_real_op<double, base_device::DEVICE_GPU>::operator()(const int dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    int thread = 1024; // same block size as the complex wrapper in this file; the kernel is declared __launch_bounds__(1024)
+    int block = (dim + thread - 1) / thread; // ceil(dim / thread) blocks, so each of the dim elements gets a thread
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel<double>), // real-valued (T = double) instantiation of the kernel defined in this file
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
 template <typename FPTYPE>
 inline void vector_mul_real_wrapper(const int dim,
                                     std::complex<FPTYPE>* result,
@@ -96,7 +136,7 @@ inline void vector_mul_real_wrapper(const int dim,
     const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel<FPTYPE>),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_real_kernel<thrust::complex<FPTYPE>>),
                        dim3(block),
                        dim3(thread),
                        0,
@@ -378,4 +418,26 @@ double dot_real_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(co
 {
     return dot_complex_wrapper(dim, psi_L, psi_R, reduce);
 }
+
+// Explicitly instantiate functors for the types of functor registered.
+template struct vector_mul_real_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_real_op<double, base_device::DEVICE_GPU>;
+template struct vector_mul_real_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
+template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct dot_real_op<double, base_device::DEVICE_GPU>;
+template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 } // namespace ModuleBase
diff --git a/source/module_base/kernels/test/CMakeLists.txt b/source/module_base/kernels/test/CMakeLists.txt
@@ -3,5 +3,5 @@ remove_definitions(-D__MPI)
 AddTest(
     TARGET Base_Kernels_UTs
     LIBS parameter ${math_libs} base device 
-    SOURCES math_op_test.cpp math_kernel_test.cpp
+    SOURCES math_ylm_op_test.cpp math_kernel_test.cpp
 )
diff --git a/source/module_base/kernels/test/math_ylm_op_test.cpp b/source/module_base/kernels/test/math_ylm_op_test.cpp
@@ -1,4 +1,4 @@
-#include "module_base/kernels/math_op.h"
+#include "module_base/kernels/math_ylm_op.h"
 
 #include "module_base/module_device/memory_op.h"
 
diff --git a/source/module_hsolver/diago_bpcg.h b/source/module_hsolver/diago_bpcg.h
@@ -7,6 +7,7 @@
 #include "module_base/para_gemm.h"
 #include "module_hamilt_general/hamilt.h"
 #include "module_hamilt_pw/hamilt_pwdft/structure_factor.h"
+#include "module_hsolver/kernels/bpcg_kernel_op.h"
 #include "module_hsolver/kernels/dngvd_op.h"
 #include "module_hsolver/para_linear_transform.h"
 
diff --git a/source/module_hsolver/diago_cg.cpp b/source/module_hsolver/diago_cg.cpp
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp
diff --git a/source/module_hsolver/kernels/bpcg_kernel_op.cpp b/source/module_hsolver/kernels/bpcg_kernel_op.cpp
diff --git a/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/module_hsolver/kernels/cuda/bpcg_kernel_op.cu
diff --git a/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu