Finish CUDA kernel

Critsium-xy · Critsium-xy · commit 6060b43d3c4d · 2025-01-13T19:38:13.000+08:00
diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp
@@ -13,6 +13,7 @@
 #include <thrust/execution_policy.h>
 #include <thrust/inner_product.h>
 #include "module_base/tool_quit.h"
+#include "module_base/kernels/cuda/math_op.cu"
 
 #include "cublas_v2.h"
 
@@ -668,6 +669,11 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec
             result[i] = vector1[i] * vector2[i];
         }
 	}
+	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
+#ifdef __CUDA
+		vector_mul_vector_complex_wrapper(d, dim, result, vector1, vector2);
+#endif
+	}
 }
 
 
@@ -683,4 +689,9 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec
             result[i] = vector1[i] / vector2[i];
         }
 	}
+	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
+#ifdef __CUDA
+		vector_div_vector_complex_wrapper(d, dim, result, vector1, vector2);
+#endif
+	}
 }
diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu
@@ -154,4 +154,69 @@ void cal_ylm_real_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
 template struct cal_ylm_real_op<float, base_device::DEVICE_GPU>;
 template struct cal_ylm_real_op<double, base_device::DEVICE_GPU>;
 
+
+// The following are kernels for the new blas_connector.
+
+
+// Element-wise product kernel: result[k] = vector1[k] * vector2[k] for k < size.
+//
+// Each thread derives one flat 1D index from blockIdx.x, blockDim.x and
+// threadIdx.x and handles at most that single element; threads whose index
+// is at or beyond `size` return without touching memory.
+//
+//   size    : number of elements to process
+//   result  : output array, written at indices [0, size)
+//   vector1 : first operand, elements of type T
+//   vector2 : second operand, elements of type GetTypeReal<T>::type
+template <typename T>
+__global__ void vector_mul_vector_kernel(
+    const int size,
+    T* result,
+    const T* vector1,
+    const typename GetTypeReal<T>::type* vector2)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size)
+    {
+        // Tail threads of the last block have no element to process.
+        return;
+    }
+    result[idx] = vector1[idx] * vector2[idx];
+}
+
+// Element-wise quotient kernel: result[k] = vector1[k] / vector2[k] for k < size.
+//
+// Each thread derives one flat 1D index from blockIdx.x, blockDim.x and
+// threadIdx.x and handles at most that single element; threads whose index
+// is at or beyond `size` return without touching memory.
+//
+//   size    : number of elements to process
+//   result  : output array, written at indices [0, size)
+//   vector1 : numerator, elements of type T
+//   vector2 : denominator, elements of type GetTypeReal<T>::type
+template <typename T>
+__global__ void vector_div_vector_kernel(
+    const int size,
+    T* result,
+    const T* vector1,
+    const typename GetTypeReal<T>::type* vector2)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size)
+    {
+        // Tail threads of the last block have no element to process.
+        return;
+    }
+    result[idx] = vector1[idx] / vector2[idx];
+}
+
+// Host-side launcher for vector_div_constant_kernel on complex data.
+//
+// Reinterprets the std::complex<FPTYPE> buffers as thrust::complex<FPTYPE> and
+// launches the kernel with thread_per_block threads per block and enough
+// blocks to cover `dim` elements (ceiling division).
+//
+//   d        : device tag; not read inside this function
+//   dim      : number of elements; nothing is launched when dim <= 0
+//   result   : output buffer handed to the kernel
+//   vector   : input buffer handed to the kernel
+//   constant : scalar forwarded to the kernel
+//
+// NOTE(review): vector_div_constant_kernel is not defined in the visible part
+// of this file; presumably it computes result[i] = vector[i] / constant --
+// confirm it is declared before this point.
+// NOTE(review): the pointers are passed straight to a kernel, so they are
+// presumably device pointers -- confirm at the call sites.
+template <typename FPTYPE>
+inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                                const int dim,
+                                                std::complex<FPTYPE>* result,
+                                                const std::complex<FPTYPE>* vector,
+                                                const FPTYPE constant)
+{
+    if (dim <= 0)
+    {
+        // A grid with zero blocks is an invalid launch configuration,
+        // so empty input is treated as a no-op instead of a launch error.
+        return;
+    }
+
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+
+    const int thread = thread_per_block;
+    const int block = (dim + thread - 1) / thread; // ceil(dim / thread)
+    vector_div_constant_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector_tmp, constant);
+
+    cudaCheckOnDebug();
+}
+
+// Host-side launcher for vector_mul_vector_kernel on complex data:
+// result[i] = vector1[i] * vector2[i] for i in [0, dim).
+//
+// Reinterprets the std::complex<FPTYPE> buffers as thrust::complex<FPTYPE> and
+// launches the kernel with thread_per_block threads per block and enough
+// blocks to cover `dim` elements (ceiling division).
+//
+//   d       : device tag; not read inside this function
+//   dim     : number of elements; nothing is launched when dim <= 0
+//   result  : complex output buffer handed to the kernel
+//   vector1 : complex operand handed to the kernel
+//   vector2 : FPTYPE operand handed to the kernel
+//
+// NOTE(review): the pointers are passed straight to a kernel, so they are
+// presumably device pointers -- confirm at the call sites.
+template <typename FPTYPE>
+inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
+                                              const int& dim,
+                                              std::complex<FPTYPE>* result,
+                                              const std::complex<FPTYPE>* vector1,
+                                              const FPTYPE* vector2)
+{
+    if (dim <= 0)
+    {
+        // A grid with zero blocks is an invalid launch configuration,
+        // so empty input is treated as a no-op instead of a launch error.
+        return;
+    }
+
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
+
+    const int thread = thread_per_block;
+    const int block = (dim + thread - 1) / thread; // ceil(dim / thread)
+    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
+
+    cudaCheckOnDebug();
+}
+
 }  // namespace ModuleBase

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@`
`13`	`13`	`#include <thrust/execution_policy.h>`
`14`	`14`	`#include <thrust/inner_product.h>`
`15`	`15`	`#include "module_base/tool_quit.h"`
	`16`	`+#include "module_base/kernels/cuda/math_op.cu"`
`16`	`17`
`17`	`18`	`#include "cublas_v2.h"`
`18`	`19`
`@@ -668,6 +669,11 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec`
`668`	`669`	`result[i] = vector1[i] * vector2[i];`
`669`	`670`	`}`
`670`	`671`	`}`
	`672`	`+ else if (device_type == base_device::AbacusDevice_t::GpuDevice){`
	`673`	`+#ifdef __CUDA`
	`674`	`+ vector_mul_vector_complex_wrapper(d, dim, result, vector1, vector2);`
	`675`	`+#endif`
	`676`	`+ }`
`671`	`677`	`}`
`672`	`678`
`673`	`679`
`@@ -683,4 +689,9 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec`
`683`	`689`	`result[i] = vector1[i] / vector2[i];`
`684`	`690`	`}`
`685`	`691`	`}`
	`692`	`+ else if (device_type == base_device::AbacusDevice_t::GpuDevice){`
	`693`	`+#ifdef __CUDA`
	`694`	`+ vector_div_vector_complex_wrapper(d, dim, result, vector1, vector2);`
	`695`	`+#endif`
	`696`	`+ }`
`686`	`697`	`}`