deepmodeling · mohanchen · Sep 9, 2025 · Sep 5, 2025 · Sep 5, 2025 · Sep 8, 2025
diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -133,6 +133,14 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const
     }
 }
 
+template <typename T, typename Real>
+__global__ void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= n || row >= m) return;
+    c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
+}
+
 cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name)
 {
     if (trans == 'N')
@@ -147,7 +155,7 @@ cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char*
     {
         return CUBLAS_OP_C;
     }
-    else 
+    else
     {
         ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !"));
     }
@@ -438,10 +446,44 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
     cudaCheckOnDebug();
 }
 
+template <>
+void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    matrix_multiply_vector_kernel<double, double> <<<block, thread >>>(m, n, a, lda,
+    b, alpha, c, ldc);
+    cudaCheckOnDebug();
+}
+
+template <>
+void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<float> *a, const int &lda, const float *b, const float alpha, std::complex<float> *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    matrix_multiply_vector_kernel<thrust::complex<float>, float> <<<block, thread >>>(m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
+    b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
+    cudaCheckOnDebug();
+}
+
+template <>
+void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<double> *a, const int &lda, const double *b, const double alpha, std::complex<double> *c, const int &ldc)
+{
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    matrix_multiply_vector_kernel<thrust::complex<double>, double> <<<block, thread >>>(m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
+    b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
+    cudaCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 
 template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrix_mul_vector_op<double, base_device::DEVICE_GPU>;
+template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 }  // namespace ModuleBase
diff --git a/source/source_base/kernels/math_kernel_op.cpp b/source/source_base/kernels/math_kernel_op.cpp
@@ -119,12 +119,35 @@ struct matrixCopy<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct matrix_mul_vector_op<T, base_device::DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& m, const int &n,
+                  T *a,
+                  const int &lda,
+                  const Real *b,
+                  const Real alpha,
+                  T *c,
+                  const int &ldc){
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
+#endif
+        for (int j = 0; j < n; j++){
+            for (int i = 0; i < m; i++){
+                c[j * ldc + i] = a[j * lda + i] * b[j] * alpha;
+            }
+        }
+
+    }
+};
+
 template struct gemv_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct gemv_op<float, base_device::DEVICE_CPU>;
 template struct gemm_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct gemm_op<float, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 
 template struct gemv_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct gemv_op<double, base_device::DEVICE_CPU>;
@@ -133,6 +156,8 @@ template struct gemm_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixCopy<double, base_device::DEVICE_CPU>;
 template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrix_mul_vector_op<double, base_device::DEVICE_CPU>;
+template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
 #ifdef __LCAO
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;

diff --git a/source/source_base/kernels/math_kernel_op.h b/source/source_base/kernels/math_kernel_op.h
@@ -104,7 +104,7 @@ template <typename T, typename Device> struct vector_div_constant_op {
   ///
   /// Input Parameters
   /// \param dim : array size
-  /// \param vector : input array 
+  /// \param vector : input array
   /// \param constant : input constant
   ///
   /// Output Parameters
@@ -298,6 +298,31 @@ template <typename T, typename Device> struct matrixCopy {
   void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
+template <typename T, typename Device>
+struct matrix_mul_vector_op {
+    using Real = typename GetTypeReal<T>::type;
+  /// @brief a * b * beta by each column
+  ///
+  /// Input Parameters
+  /// \param m : row number
+  /// \param n : column number
+  /// \param a : input matrix
+  /// \param lda : leading dimension of matrix a
+  /// \param b : input vector
+  /// \param alpha : factor
+  /// \param ldc : leading dimension of matrix c
+  ///
+  /// Output Parameters
+  /// \param c : output matrix
+  void operator()(const int &m, const int &n,
+                  T *a,
+                  const int &lda,
+                  const Real *b,
+                  const Real alpha,
+                  T *c,
+                  const int &ldc);
+};
+
 template <typename T, typename Device>
 struct apply_eigenvalues_op {
     using Real = typename GetTypeReal<T>::type;
@@ -314,7 +339,7 @@ struct precondition_op {
                    T* psi_iter,
                    const int& nbase,
                    const int& notconv,
-                   const Real* precondition,  
+                   const Real* precondition,
                    const Real* eigenvalues);
 };
 
@@ -393,6 +418,17 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
                     const int& LDB);
 };
 
+template <typename T> struct matrix_mul_vector_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int &m, const int &n,
+                  T *a,
+                  const int &lda,
+                  const Real *b,
+                  const Real alpha,
+                  T *c,
+                  const int &ldc);
+};
+
 void createGpuBlasHandle();
 void destoryBLAShandle();
 

diff --git a/source/source_base/kernels/rocm/math_kernel_op.hip.cu b/source/source_base/kernels/rocm/math_kernel_op.hip.cu
@@ -145,6 +145,15 @@ __launch_bounds__(1024) __global__
     }
 }
 
+template <typename T, typename Real>
+__launch_bounds__(1024) __global__
+void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= n || row >= m) return;
+    c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
+}
+
 hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name)
 {
     if (trans == 'N')
@@ -159,7 +168,7 @@ hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char
     {
         return HIPBLAS_OP_C;
     }
-    else 
+    else
     {
         ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !"));
     }
@@ -437,7 +446,38 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
     hipCheckOnDebug();
 }
 
+template <>
+void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<double, double>), dim3(block, thread),
+        m, n, a, lda, b, alpha, c, ldc);
+    hipCheckOnDebug();
+}
 
+template <>
+void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<float> *a, const int &lda, const float *b, const float alpha, std::complex<float> *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<thrust::complex<float>, float>), dim3(block, thread),
+        m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
+        b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
+    hipCheckOnDebug();
+}
+
+template <>
+void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<double> *a, const int &lda, const double *b, const double alpha, std::complex<double> *c, const int &ldc)
+{
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<thrust::complex<double>, double>), dim3(block, thread),
+        m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
+        b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
+    hipCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct matrixCopy<double, base_device::DEVICE_GPU>;

diff --git a/source/source_base/module_device/cuda/memory_op.cu b/source/source_base/module_device/cuda/memory_op.cu
@@ -85,6 +85,16 @@ void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
     cudaErrcheck(cudaMemset(arr, var, sizeof(FPTYPE) * size));
 }
 
+template <typename FPTYPE>
+void set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
+                                                                   const size_t pitch,
+                                                                   const int var,
+                                                                   const size_t width,
+                                                                   const size_t height)
+{
+    cudaErrcheck(cudaMemset2D(arr, sizeof(FPTYPE) * pitch , var, sizeof(FPTYPE) * width, height));
+}
+
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
     FPTYPE* arr_out,
@@ -112,6 +122,42 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
     cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyDeviceToDevice));
 }
 
+template <typename FPTYPE>
+void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
+    FPTYPE* arr_out,
+    const size_t dpitch,
+    const FPTYPE* arr_in,
+    const size_t spitch,
+    const size_t width,
+    const size_t height)
+{
+    cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToHost));
+}
+
+template <typename FPTYPE>
+void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
+    FPTYPE* arr_out,
+    const size_t dpitch,
+    const FPTYPE* arr_in,
+    const size_t spitch,
+    const size_t width,
+    const size_t height)
+{
+    cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyHostToDevice));
+}
+
+template <typename FPTYPE>
+void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
+    FPTYPE* arr_out,
+    const size_t dpitch,
+    const FPTYPE* arr_in,
+    const size_t spitch,
+    const size_t width,
+    const size_t height)
+{
+    cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToDevice));
+}
+
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
@@ -196,6 +242,12 @@ template struct set_memory_op<double, base_device::DEVICE_GPU>;
 template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct set_memory_2d_op<int, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<float, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<double, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
 template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
@@ -212,6 +264,22 @@ template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+
 template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;