From a967bd896a368713b8cb5b8d791d7116bded064b Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Fri, 10 Jan 2025 01:07:51 +0800 Subject: [PATCH 01/12] Added some other necessary kernels --- source/module_base/blas_connector.cpp | 81 ++++++++++++++++++++++++++- source/module_base/blas_connector.h | 22 +++++++- 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp index 106eb6c4e8..8d749e4783 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -1,4 +1,5 @@ #include "blas_connector.h" +#include "macros.h" #ifdef __DSP #include "module_base/kernels/dsp/dsp_connector.h" @@ -652,4 +653,82 @@ void BlasConnector::copy(const long n, const std::complex *a, const int if (device_type == base_device::AbacusDevice_t::CpuDevice) { zcopy_(&n, a, &incx, b, &incy); } -} \ No newline at end of file +} + +template +void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){ + using Real = typename GetTypeReal::type; + if (device_type == base_device::AbacusDevice_t::CpuDevice) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * vector2[i]; + } + } +} + + +template +void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){ + using Real = typename GetTypeReal::type; + if (device_type == base_device::AbacusDevice_t::CpuDevice) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 4096 / sizeof(Real)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] / vector2[i]; + } + } +} + + +void constantvector_addORsub_constantVector_op(const int& dim, float* result, const float* vector1, const float constant1, const float* vector2, const float constant2, base_device::AbacusDevice_t device_type){ + if (device_type == base_device::AbacusDevice_t::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } +} + +void constantvector_addORsub_constantVector_op(const int& dim, double* result, const double* vector1, const double constant1, const double* vector2, const double constant2, base_device::AbacusDevice_t device_type){ + if (device_type == base_device::AbacusDevice_t::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } +} + +void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const float constant1, const std::complex* vector2, const float constant2, base_device::AbacusDevice_t device_type){ + if (device_type == base_device::AbacusDevice_t::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } +} + +void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const double constant1, const std::complex* vector2, const double constant2, base_device::AbacusDevice_t device_type){ + if (device_type == base_device::AbacusDevice_t::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for 
schedule(static, 8192 / sizeof(T)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } +} \ No newline at end of file diff --git a/source/module_base/blas_connector.h b/source/module_base/blas_connector.h index 7675429520..923f35c87c 100644 --- a/source/module_base/blas_connector.h +++ b/source/module_base/blas_connector.h @@ -3,6 +3,7 @@ #include #include "module_base/module_device/types.h" +#include "macros.h" // These still need to be linked in the header file // Because quite a lot of code will directly use the original cblas kernels. @@ -303,7 +304,26 @@ class BlasConnector static void copy(const long n, const std::complex *a, const int incx, std::complex *b, const int incy, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); - // A is symmetric + // There is some other operators needed, so implemented manually here + template + static + void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + template + static + void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void constantvector_addORsub_constantVector_op(const int& dim, float* result, const float* vector1, const float constant1, const float* vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void constantvector_addORsub_constantVector_op(const int& dim, double* result, const double* vector1, const double constant1, const double* vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const float constant1, const std::complex* vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const double constant1, const std::complex* vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); }; // If GATHER_INFO is defined, the original function is replaced with a "i" suffix, From 1c68629098a8b55ac4e5e8c23b8446573fcbc000 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Fri, 10 Jan 2025 14:36:38 +0800 Subject: [PATCH 02/12] Fix compiling bug --- source/module_base/blas_connector.cpp | 51 +-------------------------- source/module_base/blas_connector.h | 12 ------- 2 files changed, 1 insertion(+), 62 deletions(-) diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp index 8d749e4783..723256abd2 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -682,53 +682,4 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec result[i] = vector1[i] / vector2[i]; } } -} - - -void constantvector_addORsub_constantVector_op(const int& dim, float* result, const float* vector1, const float constant1, const float* vector2, const float constant2, base_device::AbacusDevice_t device_type){ - if (device_type == base_device::AbacusDevice_t::CpuDevice){ -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / 
sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } - } -} - -void constantvector_addORsub_constantVector_op(const int& dim, double* result, const double* vector1, const double constant1, const double* vector2, const double constant2, base_device::AbacusDevice_t device_type){ - if (device_type == base_device::AbacusDevice_t::CpuDevice){ -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } - } -} - -void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const float constant1, const std::complex* vector2, const float constant2, base_device::AbacusDevice_t device_type){ - if (device_type == base_device::AbacusDevice_t::CpuDevice){ -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } - } -} - -void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const double constant1, const std::complex* vector2, const double constant2, base_device::AbacusDevice_t device_type){ - if (device_type == base_device::AbacusDevice_t::CpuDevice){ -#ifdef _OPENMP -#pragma omp parallel for schedule(static, 8192 / sizeof(T)) -#endif - for (int i = 0; i < dim; i++) - { - result[i] = vector1[i] * constant1 + vector2[i] * constant2; - } - } -} \ No newline at end of file +} \ No newline at end of file diff --git a/source/module_base/blas_connector.h b/source/module_base/blas_connector.h index 923f35c87c..5222d59b53 100644 --- a/source/module_base/blas_connector.h +++ b/source/module_base/blas_connector.h @@ -312,18 +312,6 @@ class BlasConnector template static void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); - - static - void constantvector_addORsub_constantVector_op(const int& dim, float* result, const float* vector1, const float constant1, const float* vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); - - static - void constantvector_addORsub_constantVector_op(const int& dim, double* result, const double* vector1, const double constant1, const double* vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); - - static - void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const float constant1, const std::complex* vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); - - static - void constantvector_addORsub_constantVector_op(const int& dim, std::complex* result, const std::complex* vector1, const double constant1, const std::complex* vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); }; // If GATHER_INFO is defined, the original function is replaced with a "i" suffix, From f8e9ae6fece13b93976bad43c7c439ff2cecba95 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Fri, 10 Jan 2025 22:22:01 +0800 Subject: [PATCH 03/12] XX --- source/module_base/blas_connector.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/module_base/blas_connector.cpp 
b/source/module_base/blas_connector.cpp index 723256abd2..ebd42320f0 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -655,6 +655,7 @@ void BlasConnector::copy(const long n, const std::complex *a, const int } } + template void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){ using Real = typename GetTypeReal::type; From 6060b43d3c4d0da2c8b673b7e7202cea048c18f2 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Mon, 13 Jan 2025 19:38:13 +0800 Subject: [PATCH 04/12] Finish CUDA kernel --- source/module_base/blas_connector.cpp | 11 ++++ source/module_base/kernels/cuda/math_op.cu | 65 ++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp index ebd42320f0..122783defb 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -13,6 +13,7 @@ #include #include #include "module_base/tool_quit.h" +#include "module_base/kernels/cuda/math_op.cu" #include "cublas_v2.h" @@ -668,6 +669,11 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec result[i] = vector1[i] * vector2[i]; } } + else if (device_type == base_device::AbacusDevice_t::GpuDevice){ +#ifdef __CUDA + vector_mul_vector_complex_wrapper(d, dim, result, vector1, vector2); +#endif + } } @@ -683,4 +689,9 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec result[i] = vector1[i] / vector2[i]; } } + else if (device_type == base_device::AbacusDevice_t::GpuDevice){ +#ifdef __CUDA + vector_div_vector_complex_wrapper(d, dim, result, vector1, vector2); +#endif + } } \ No newline at end of file diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu index cee820414e..9f38173cb0 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_op.cu @@ -154,4 +154,69 @@ void cal_ylm_real_op::operator()(const base_dev template struct cal_ylm_real_op; template struct cal_ylm_real_op; + +// The next are kernels for new blas_connector + + +template +__global__ void vector_mul_vector_kernel( + const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector1[i] * vector2[i]; + } +} + +template +__global__ void vector_div_vector_kernel( + const int size, + T* result, + const T* vector1, + const typename GetTypeReal::type* vector2) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) + { + result[i] = vector1[i] / vector2[i]; + } +} + +template +inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d, + const int dim, + std::complex* result, + const std::complex* vector, + const FPTYPE constant) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector_tmp = reinterpret_cast*>(vector); + + int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_div_constant_kernel> <<>> (dim, result_tmp, vector_tmp, constant); + + cudaCheckOnDebug(); +} + +template +inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d, + const int& dim, + std::complex* result, + const std::complex* vector1, + const FPTYPE* vector2) +{ + thrust::complex* result_tmp = reinterpret_cast*>(result); + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); + 
int thread = thread_per_block; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); + + cudaCheckOnDebug(); +} + } // namespace ModuleBase From 2bc70974da965161c073f56ac7bd15f6c2955345 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Mon, 13 Jan 2025 19:44:46 +0800 Subject: [PATCH 05/12] Fix marcos --- source/module_base/kernels/cuda/math_op.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu index 9f38173cb0..9f5b7b397b 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_op.cu @@ -1,5 +1,6 @@ #include "cuda_runtime.h" #include "module_base/kernels/math_op.h" +#include "module_base/macros.h" #include From 911a80d2a48ed6c44e33e9352d382893cf6c5dbc Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Mon, 13 Jan 2025 22:17:41 +0800 Subject: [PATCH 06/12] Fix typename --- source/module_base/kernels/cuda/math_op.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu index 9f5b7b397b..cd97fcaad7 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_op.cu @@ -3,6 +3,10 @@ #include "module_base/macros.h" #include +#include +#include +#include +#include namespace ModuleBase { @@ -197,7 +201,7 @@ inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d thrust::complex* result_tmp = reinterpret_cast*>(result); const thrust::complex* vector_tmp = reinterpret_cast*>(vector); - int thread = thread_per_block; + int thread = THREADS_PER_BLOCK; int block = (dim + thread - 1) / thread; vector_div_constant_kernel> <<>> (dim, result_tmp, vector_tmp, constant); @@ -213,7 +217,7 @@ inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d, { thrust::complex* result_tmp = reinterpret_cast*>(result); const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = thread_per_block; + int thread = THREADS_PER_BLOCK; int block = (dim + thread - 1) / thread; vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); From 7646974763d28a93f2cee92748a72ebb28a33f76 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Tue, 14 Jan 2025 00:55:45 +0800 Subject: [PATCH 07/12] GPU implementation --- source/module_base/blas_connector.cpp | 4 +- source/module_base/kernels/cuda/math_op.cu | 79 ++++++++++++++++++++-- 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp index 122783defb..506b7c7e3f 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -671,7 +671,7 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ #ifdef __CUDA - vector_mul_vector_complex_wrapper(d, dim, result, vector1, vector2); + vector_mul_vector_gpu(dim, result, vector1, vector2); #endif } } @@ -691,7 +691,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ #ifdef __CUDA - vector_div_vector_complex_wrapper(d, dim, result, vector1, vector2); + vector_mul_vector_gpu(dim, result, vector1, vector2); #endif } } \ No newline at end of file diff --git a/source/module_base/kernels/cuda/math_op.cu 
b/source/module_base/kernels/cuda/math_op.cu index cd97fcaad7..a7438747e5 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_op.cu @@ -192,25 +192,22 @@ __global__ void vector_div_vector_kernel( } template -inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d, - const int dim, +inline void vector_div_vector_complex_wrapper(const int dim, std::complex* result, const std::complex* vector, const FPTYPE constant) { thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector_tmp = reinterpret_cast*>(vector); - + const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); int thread = THREADS_PER_BLOCK; int block = (dim + thread - 1) / thread; - vector_div_constant_kernel> <<>> (dim, result_tmp, vector_tmp, constant); + vector_div_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); cudaCheckOnDebug(); } template -inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d, - const int& dim, +inline void vector_mul_vector_complex_wrapper(const int& dim, std::complex* result, const std::complex* vector1, const FPTYPE* vector2) @@ -224,4 +221,72 @@ inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d, cudaCheckOnDebug(); } +void vector_div_vector_gpu(const int& dim, + double* result, + const double* vector1, + const double* vector2) +{ + int thread = THREADS_PER_BLOCK; + int block = (dim + thread - 1) / thread; + vector_div_vector_kernel <<>> (dim, result, vector1, vector2); + + cudaCheckOnDebug(); +} + +void vector_div_vector_gpu(const int& dim, + float* result, + const float* vector1, + const float* vector2) +{ + int thread = THREADS_PER_BLOCK; + int block = (dim + thread - 1) / thread; + vector_div_vector_kernel <<>> (dim, result, vector1, vector2); + + cudaCheckOnDebug(); +} + +void vector_div_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const float* vector2) +{ + vector_div_vector_complex_wrapper(dim, result, vector1, vector2); +} + +void vector_div_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const double* vector2) +{ + vector_div_vector_complex_wrapper(dim, result, vector1, vector2); +} + +void vector_mul_vector_gpu(const int& dim, + double* result, + const double* vector1, + const double* vector2) +{ + int thread = THREADS_PER_BLOCK; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel <<>> (dim, result, vector1, vector2); + + cudaCheckOnDebug(); +} + +void vector_mul_vector_gpu(const int& dim, + float* result, + const float* vector1, + const float* vector2) +{ + int thread = THREADS_PER_BLOCK; + int block = (dim + thread - 1) / thread; + vector_mul_vector_kernel <<>> (dim, result, vector1, vector2); + + cudaCheckOnDebug(); +} + +void vector_mul_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const float* vector2) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); +} + +void vector_mul_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const double* vector2) +{ + vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); +} + } // namespace ModuleBase From 91e8dc21bfb69c8f464df3648daa0face178f5ec Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Tue, 14 Jan 2025 01:57:10 +0800 Subject: [PATCH 08/12] Fix bugs --- source/module_base/blas_connector.cpp | 13 +- source/module_base/kernels/cuda/math_op.cu | 137 +-------------------- 2 files changed, 6 insertions(+), 144 
deletions(-) diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp index 506b7c7e3f..8a8f65f5ec 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -9,13 +9,10 @@ #ifdef __CUDA #include #include -#include -#include -#include -#include "module_base/tool_quit.h" -#include "module_base/kernels/cuda/math_op.cu" - #include "cublas_v2.h" +#include "module_hsolver/kernels/math_kernel_op.h" +#include "module_base/module_device/memory_op.h" + namespace BlasUtils{ @@ -671,7 +668,7 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ #ifdef __CUDA - vector_mul_vector_gpu(dim, result, vector1, vector2); + hsolver::vector_mul_vector_op()(gpu_ctx, dim, result, vector1, vector2); #endif } } @@ -691,7 +688,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ #ifdef __CUDA - vector_mul_vector_gpu(dim, result, vector1, vector2); + hsolver::vector_div_vector_op()(gpu_ctx, dim, result, vector1, vector2); #endif } } \ No newline at end of file diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu index a7438747e5..cef3c04f3f 100644 --- a/source/module_base/kernels/cuda/math_op.cu +++ b/source/module_base/kernels/cuda/math_op.cu @@ -1,12 +1,7 @@ -#include "cuda_runtime.h" +#include #include "module_base/kernels/math_op.h" -#include "module_base/macros.h" #include -#include -#include -#include -#include namespace ModuleBase { @@ -159,134 +154,4 @@ void cal_ylm_real_op::operator()(const base_dev template struct cal_ylm_real_op; template struct cal_ylm_real_op; - -// The next are kernels for new blas_connector - - -template -__global__ void vector_mul_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] * vector2[i]; - } -} - -template -__global__ void vector_div_vector_kernel( - const int size, - T* result, - const T* vector1, - const typename GetTypeReal::type* vector2) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) - { - result[i] = vector1[i] / vector2[i]; - } -} - -template -inline void vector_div_vector_complex_wrapper(const int dim, - std::complex* result, - const std::complex* vector, - const FPTYPE constant) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = THREADS_PER_BLOCK; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); - - cudaCheckOnDebug(); -} - -template -inline void vector_mul_vector_complex_wrapper(const int& dim, - std::complex* result, - const std::complex* vector1, - const FPTYPE* vector2) -{ - thrust::complex* result_tmp = reinterpret_cast*>(result); - const thrust::complex* vector1_tmp = reinterpret_cast*>(vector1); - int thread = THREADS_PER_BLOCK; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel> <<>> (dim, result_tmp, vector1_tmp, vector2); - - cudaCheckOnDebug(); -} - -void vector_div_vector_gpu(const int& dim, - double* result, - const double* vector1, - const double* vector2) -{ - int thread = THREADS_PER_BLOCK; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel <<>> (dim, result, vector1, 
vector2); - - cudaCheckOnDebug(); -} - -void vector_div_vector_gpu(const int& dim, - float* result, - const float* vector1, - const float* vector2) -{ - int thread = THREADS_PER_BLOCK; - int block = (dim + thread - 1) / thread; - vector_div_vector_kernel <<>> (dim, result, vector1, vector2); - - cudaCheckOnDebug(); -} - -void vector_div_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const float* vector2) -{ - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); -} - -void vector_div_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const double* vector2) -{ - vector_div_vector_complex_wrapper(dim, result, vector1, vector2); -} - -void vector_mul_vector_gpu(const int& dim, - double* result, - const double* vector1, - const double* vector2) -{ - int thread = THREADS_PER_BLOCK; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel <<>> (dim, result, vector1, vector2); - - cudaCheckOnDebug(); -} - -void vector_mul_vector_gpu(const int& dim, - float* result, - const float* vector1, - const float* vector2) -{ - int thread = THREADS_PER_BLOCK; - int block = (dim + thread - 1) / thread; - vector_mul_vector_kernel <<>> (dim, result, vector1, vector2); - - cudaCheckOnDebug(); -} - -void vector_mul_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const float* vector2) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); -} - -void vector_mul_vector_gpu(const int& dim, std::complex* result, const std::complex* vector1, const double* vector2) -{ - vector_mul_vector_complex_wrapper(dim, result, vector1, vector2); -} - } // namespace ModuleBase From 31d4dff254bc99895516ddca4654081430bdc20b Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Tue, 14 Jan 2025 13:37:21 +0800 Subject: [PATCH 09/12] add vector_add_vector kernel --- source/module_base/blas_connector.cpp | 72 +++++++++++++++++++ source/module_base/blas_connector.h | 13 ++++ .../kernels/cuda/math_kernel_op.cu | 1 + 3 files changed, 86 insertions(+) diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp index 8a8f65f5ec..14fb76e2ed 100644 --- a/source/module_base/blas_connector.cpp +++ b/source/module_base/blas_connector.cpp @@ -691,4 +691,76 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec hsolver::vector_div_vector_op()(gpu_ctx, dim, result, vector1, vector2); #endif } +} + +void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type) +{ + if (device_type == base_device::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(float)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } + else if (device_type == base_device::GpuDevice){ +#ifdef __CUDA + hsolver::constantvector_addORsub_constantVector_op()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2); +#endif + } +} + +void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type) +{ + if (device_type == base_device::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(double)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } + else if (device_type 
== base_device::GpuDevice){ +#ifdef __CUDA + hsolver::constantvector_addORsub_constantVector_op()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2); +#endif + } +} + +void vector_add_vector(const int& dim, std::complex *result, const std::complex *vector1, const float constant1, const std::complex *vector2, const float constant2, base_device::AbacusDevice_t device_type) +{ + if (device_type == base_device::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } + else if (device_type == base_device::GpuDevice){ +#ifdef __CUDA + hsolver::constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2); +#endif + } +} + +void vector_add_vector(const int& dim, std::complex *result, const std::complex *vector1, const double constant1, const std::complex *vector2, const double constant2, base_device::AbacusDevice_t device_type) +{ + if (device_type == base_device::CpuDevice){ +#ifdef _OPENMP +#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex)) +#endif + for (int i = 0; i < dim; i++) + { + result[i] = vector1[i] * constant1 + vector2[i] * constant2; + } + } + else if (device_type == base_device::GpuDevice){ +#ifdef __CUDA + hsolver::constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2); +#endif + } } \ No newline at end of file diff --git a/source/module_base/blas_connector.h b/source/module_base/blas_connector.h index 5222d59b53..809f80c092 100644 --- a/source/module_base/blas_connector.h +++ b/source/module_base/blas_connector.h @@ -312,6 +312,19 @@ class BlasConnector template static void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + // y = alpha * x + beta * y + static + void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void vector_add_vector(const int& dim, std::complex *result, const std::complex *vector1, const float constant1, const std::complex *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); + + static + void vector_add_vector(const int& dim, std::complex *result, const std::complex *vector1, const double constant1, const std::complex *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); }; // If GATHER_INFO is defined, the original function is replaced with a "i" suffix, diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu index 149b9ce389..70ed5ebf0b 100644 --- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu @@ -1043,6 +1043,7 @@ template struct line_minimize_with_block_op, base_device::DE template struct vector_div_constant_op, 
base_device::DEVICE_GPU>; template struct vector_mul_vector_op, base_device::DEVICE_GPU>; template struct vector_div_vector_op, base_device::DEVICE_GPU>; +template struct constantvector_addORsub_constantVector_op; template struct constantvector_addORsub_constantVector_op, base_device::DEVICE_GPU>; template struct matrixSetToAnother, base_device::DEVICE_GPU>; From c778f26c34f8054c659c16626d037016609af0c7 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Wed, 15 Jan 2025 14:00:02 +0800 Subject: [PATCH 10/12] Add blas_connector CPU tests --- .../module_base/test/blas_connector_test.cpp | 132 +++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp index cf445ecb7a..51675ed56f 100644 --- a/source/module_base/test/blas_connector_test.cpp +++ b/source/module_base/test/blas_connector_test.cpp @@ -1,4 +1,5 @@ #include "../blas_connector.h" +#include "../module_device/memory_op.h" #include "gtest/gtest.h" #include @@ -84,8 +85,7 @@ TEST(blas_connector, Scal) { }; for (int i = 0; i < size; i++) answer[i] = result[i] * scale; - BlasConnector bs; - bs.scal(size,scale,result,incx); + BlasConnector::scal(size,scale,result,incx); // incx is the spacing between elements if result for (int i = 0; i < size; i++) { EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real()); @@ -313,6 +313,84 @@ TEST(blas_connector, zgemv_) { } } +TEST(blas_connector, Gemv) { + typedef std::complex T; + const char transa_m = 'N'; + const char transa_n = 'T'; + const char transa_h = 'C'; + const int size_m = 3; + const int size_n = 4; + const T alpha_const = {2, 3}; + const T beta_const = {3, 4}; + const int lda = 5; + const int incx = 1; + const int incy = 1; + std::array x_const_m, x_const_c, result_m, answer_m, c_dot_m{}; + std::array x_const_n, result_n, result_c, answer_n, answer_c, + c_dot_n{}, c_dot_c{}; + std::generate(x_const_n.begin(), x_const_n.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(result_n.begin(), result_n.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(x_const_m.begin(), x_const_m.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(result_m.begin(), result_m.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::array a_const; + std::generate(a_const.begin(), a_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + for (int i = 0; i < size_m; i++) { + for (int j = 0; j < size_n; j++) { + c_dot_m[i] += a_const[i + j * lda] * x_const_n[j]; + } + answer_m[i] = alpha_const * c_dot_m[i] + beta_const * result_m[i]; + } + BlasConnector::gemv(transa_m, size_m, size_n, alpha_const, a_const.data(), lda, + x_const_n.data(), incx, beta_const, result_m.data(), incy); + + for (int j = 0; j < size_n; j++) { + for (int i = 0; i < size_m; i++) { + c_dot_n[j] += a_const[i + j * lda] * x_const_m[i]; + } + answer_n[j] = alpha_const * c_dot_n[j] + beta_const * result_n[j]; + } + BlasConnector::gemv(transa_n, size_m, size_n, alpha_const, a_const.data(), lda, + x_const_n.data(), incx, beta_const, result_m.data(), incy); + + for (int j = 0; j < size_n; j++) { + for (int i = 0; i < size_m; i++) { 
+ c_dot_c[j] += conj(a_const[i + j * lda]) * x_const_c[i]; + } + answer_c[j] = alpha_const * c_dot_c[j] + beta_const * result_c[j]; + } + BlasConnector::gemv(transa_h, size_m, size_n, alpha_const, a_const.data(), lda, + x_const_n.data(), incx, beta_const, result_m.data(), incy); + + for (int i = 0; i < size_m; i++) { + EXPECT_DOUBLE_EQ(answer_m[i].real(), result_m[i].real()); + EXPECT_DOUBLE_EQ(answer_m[i].imag(), result_m[i].imag()); + } + for (int j = 0; j < size_n; j++) { + EXPECT_DOUBLE_EQ(answer_n[j].real(), result_n[j].real()); + EXPECT_DOUBLE_EQ(answer_n[j].imag(), result_n[j].imag()); + } + for (int j = 0; j < size_n; j++) { + EXPECT_DOUBLE_EQ(answer_c[j].real(), result_c[j].real()); + EXPECT_DOUBLE_EQ(answer_c[j].imag(), result_c[j].imag()); + } +} + + TEST(blas_connector, dgemm_) { typedef double T; const char transa_m = 'N'; @@ -404,6 +482,56 @@ TEST(blas_connector, zgemm_) { } } +TEST(blas_connector, Gemm) { + typedef std::complex T; + const char transa_m = 'N'; + const char transb_m = 'N'; + const int size_m = 3; + const int size_n = 4; + const int size_k = 5; + const T alpha_const = {2, 3}; + const T beta_const = {3, 4}; + const int lda = 6; + const int ldb = 5; + const int ldc = 4; + std::array a_const; + std::array b_const; + std::array c_dot{}, answer, result; + std::generate(a_const.begin(), a_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(b_const.begin(), b_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(result.begin(), result.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + for (int i = 0; i < size_m; i++) { + for (int j = 0; j < size_n; j++) { + for (int k = 0; k < size_k; k++) { + c_dot[i + j * ldc] += + a_const[i + k * lda] * b_const[k + j * ldb]; + } + answer[i + j * ldc] = alpha_const * c_dot[i + j * ldc] + + beta_const * result[i + j * ldc]; + } + } + BlasConnector::gemm(transa_m, transb_m, size_m, size_n, size_k, alpha_const, + a_const.data(), lda, b_const.data(), ldb, beta_const, + result.data(), ldc); + + for (int i = 0; i < size_m; i++) + for (int j = 0; j < size_n; j++) { + EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(), + result[i + j * ldc].real()); + EXPECT_DOUBLE_EQ(answer[i + j * ldc].imag(), + result[i + j * ldc].imag()); + } +} + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From bc8ed09b9d5fc63efb80167f2cf3fadc138ad384 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Wed, 15 Jan 2025 14:13:39 +0800 Subject: [PATCH 11/12] Fix blas usgae --- source/module_base/test/blas_connector_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp index 51675ed56f..2368d6dda1 100644 --- a/source/module_base/test/blas_connector_test.cpp +++ b/source/module_base/test/blas_connector_test.cpp @@ -365,7 +365,7 @@ TEST(blas_connector, Gemv) { answer_n[j] = alpha_const * c_dot_n[j] + beta_const * result_n[j]; } BlasConnector::gemv(transa_n, size_m, size_n, alpha_const, a_const.data(), lda, - x_const_n.data(), incx, beta_const, result_m.data(), incy); + x_const_m.data(), incx, beta_const, result_n.data(), incy); for (int j = 0; j < size_n; j++) { for (int i = 0; i < size_m; i++) { @@ -374,7 +374,7 @@ TEST(blas_connector, Gemv) { 
answer_c[j] = alpha_const * c_dot_c[j] + beta_const * result_c[j]; } BlasConnector::gemv(transa_h, size_m, size_n, alpha_const, a_const.data(), lda, - x_const_n.data(), incx, beta_const, result_m.data(), incy); + x_const_c.data(), incx, beta_const, result_c.data(), incy); for (int i = 0; i < size_m; i++) { EXPECT_DOUBLE_EQ(answer_m[i].real(), result_m[i].real()); @@ -519,7 +519,7 @@ TEST(blas_connector, Gemm) { beta_const * result[i + j * ldc]; } } - BlasConnector::gemm(transa_m, transb_m, size_m, size_n, size_k, alpha_const, + BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const, a_const.data(), lda, b_const.data(), ldb, beta_const, result.data(), ldc); From d5e9b6c8269a8cece1e6cc524e43ac0e03668c62 Mon Sep 17 00:00:00 2001 From: critsium-xy Date: Wed, 15 Jan 2025 16:17:42 +0800 Subject: [PATCH 12/12] Add initializer and GPU tests --- source/module_base/blas_connector.h | 9 + .../module_base/test/blas_connector_test.cpp | 159 ++++++++++++++++++ 2 files changed, 168 insertions(+) diff --git a/source/module_base/blas_connector.h b/source/module_base/blas_connector.h index 809f80c092..387ff32a46 100644 --- a/source/module_base/blas_connector.h +++ b/source/module_base/blas_connector.h @@ -327,6 +327,15 @@ class BlasConnector void vector_add_vector(const int& dim, std::complex *result, const std::complex *vector1, const double constant1, const std::complex *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); }; +#ifdef __CUDA + +namespace BlasUtils{ + void createGpuBlasHandle(); + void destoryBLAShandle(); +} + +#endif + // If GATHER_INFO is defined, the original function is replaced with a "i" suffix, // preventing changes on the original code. // The real function call is at gather_math_lib_info.cpp diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp index 2368d6dda1..34f4cb51bb 100644 --- a/source/module_base/test/blas_connector_test.cpp +++ b/source/module_base/test/blas_connector_test.cpp @@ -93,6 +93,33 @@ TEST(blas_connector, Scal) { } } +#ifdef __CUDA + +TEST(blas_connector, ScalGpu) { + const int size = 8; + const std::complex scale = {2, 3}; + const int incx = 1; + std::complex result[8], answer[8]; + std::complex* result_gpu = nullptr; + resmem_zd_op()(gpu_ctx, result_gpu, 8 * sizeof(std::complex)); + for (int i=0; i< size; i++) { + result[i] = std::complex{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }; + for (int i = 0; i < size; i++) + answer[i] = result[i] * scale; + syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result, sizeof(std::complex) * 8); + BlasConnector::scal(size,scale,result_gpu,incx,base_device::AbacusDevice_t::GpuDevice); + syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result, result_gpu, sizeof(std::complex) * 8); + delmem_zd_op()(gpu_ctx, result_gpu); + // incx is the spacing between elements if result + for (int i = 0; i < size; i++) { + EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real()); + EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag()); + } +} + +#endif TEST(blas_connector, daxpy_) { typedef double T; @@ -136,6 +163,67 @@ TEST(blas_connector, zaxpy_) { } } +TEST(blas_connector, Axpy) { + typedef std::complex T; + const int size = 8; + const T scale = {2, 3}; + const int incx = 1; + const int incy = 1; + std::array x_const, result, answer; + std::generate(x_const.begin(), x_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + 
static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(result.begin(), result.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + for (int i = 0; i < size; i++) + answer[i] = x_const[i] * scale + result[i]; + BlasConnector::axpy(size, scale, x_const.data(), incx, result.data(), incy); + for (int i = 0; i < size; i++) { + EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real()); + EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag()); + } +} + +#ifdef __CUDA + +TEST(blas_connector, AxpyGpu) { + typedef std::complex T; + const int size = 8; + const T scale = {2, 3}; + const int incx = 1; + const int incy = 1; + std::array x_const, result, answer; + T* x_gpu = nullptr; + T* result_gpu = nullptr; + resmem_zd_op()(gpu_ctx, x_gpu, size * sizeof(std::complex)); + resmem_zd_op()(gpu_ctx, result_gpu, size * sizeof(std::complex)); + std::generate(x_const.begin(), x_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(result.begin(), result.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + for (int i = 0; i < size; i++) + answer[i] = x_const[i] * scale + result[i]; + syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex) * size); + syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, x_gpu, x_const.data(), sizeof(std::complex) * size); + BlasConnector::axpy(size, scale, x_gpu, incx, result_gpu, incy, base_device::AbacusDevice_t::GpuDevice); + syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex) * size); + delmem_zd_op()(gpu_ctx, result_gpu); + delmem_zd_op()(gpu_ctx, x_gpu); + for (int i = 0; i < size; i++) { + EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real()); + EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag()); + } +} + +#endif + TEST(blas_connector, dcopy_) { typedef double T; long const size = 8; @@ -532,7 +620,78 @@ TEST(blas_connector, Gemm) { } } +#ifdef __CUDA + +TEST(blas_connector, GemmGpu) { + typedef std::complex T; + const char transa_m = 'N'; + const char transb_m = 'N'; + const int size_m = 3; + const int size_n = 4; + const int size_k = 5; + const T alpha_const = {2, 3}; + const T beta_const = {3, 4}; + const int lda = 6; + const int ldb = 5; + const int ldc = 4; + std::array a_const; + std::array b_const; + std::array c_dot{}, answer, result; + std::complex* a_gpu = nullptr; + std::complex* b_gpu = nullptr; + std::complex* result_gpu = nullptr; + resmem_zd_op()(gpu_ctx, a_gpu, size_k * lda * sizeof(std::complex)); + resmem_zd_op()(gpu_ctx, b_gpu, size_n * ldb * sizeof(std::complex)); + resmem_zd_op()(gpu_ctx, result_gpu, size_n * ldc * sizeof(std::complex)); + std::generate(a_const.begin(), a_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(b_const.begin(), b_const.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + std::generate(result.begin(), result.end(), []() { + return T{static_cast(std::rand() / double(RAND_MAX)), + static_cast(std::rand() / double(RAND_MAX))}; + }); + for (int i = 0; i < size_m; i++) { + for (int j = 0; j < size_n; j++) { + for (int k = 0; k < size_k; k++) { + c_dot[i + j * ldc] += + a_const[i + k * lda] * b_const[k + j * ldb]; + } + answer[i + j * ldc] = alpha_const * c_dot[i + j * ldc] 
+ + beta_const * result[i + j * ldc]; + } + } + syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, a_gpu, a_const.data(), sizeof(std::complex) * size_k * lda); + syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, b_gpu, b_const.data(), sizeof(std::complex) * size_n * ldb); + syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex) * size_n * ldc); + BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const, + a_gpu, lda, b_gpu, ldb, beta_const, + result_gpu, ldc, base_device::AbacusDevice_t::GpuDevice); + syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex) * size_n * ldc); + delmem_zd_op()(gpu_ctx, result_gpu); + delmem_zd_op()(gpu_ctx, a_gpu); + delmem_zd_op()(gpu_ctx, b_gpu); + for (int i = 0; i < size_m; i++) + for (int j = 0; j < size_n; j++) { + EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(), + result[i + j * ldc].real()); + EXPECT_DOUBLE_EQ(answer[i + j * ldc].imag(), + result[i + j * ldc].imag()); + } +} + +#endif + int main(int argc, char **argv) { +#ifdef __CUDA + std::cout << "Initializing CublasHandle..." << std::endl; + BlasUtils::createGpuBlasHandle(); + std::cout << "Initializing CublasHandle Done." << std::endl; +#endif testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); }
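
Usage note (not part of the patch series): a minimal sketch of how the element-wise helpers this series adds to BlasConnector are meant to be called, based on the declarations in source/module_base/blas_connector.h as of patches 09/12 and 12/12. The include path and the assumption that the templated members link as declared are illustrative; the trailing device_type argument defaults to CpuDevice, and the GPU overloads dispatch to the hsolver math_kernel_op kernels once BlasUtils::createGpuBlasHandle() has been called, as the test main() in patch 12/12 does.

// Minimal usage sketch (assumptions: the templated members link as declared in
// blas_connector.h; the include path below is illustrative).
#include <complex>
#include <vector>
#include "module_base/blas_connector.h"

int main()
{
    const int dim = 4;
    std::vector<double> a = {1.0, 2.0, 3.0, 4.0};
    std::vector<double> b = {2.0, 2.0, 2.0, 2.0};
    std::vector<double> r(dim);

    // r[i] = a[i] * b[i]  (CPU path; last argument defaults to CpuDevice)
    BlasConnector::vector_mul_vector(dim, r.data(), a.data(), b.data());

    // r[i] = a[i] / b[i]
    BlasConnector::vector_div_vector(dim, r.data(), a.data(), b.data());

    // r[i] = 0.5 * a[i] + 2.0 * b[i]  (the "y = alpha * x + beta * y" helper)
    BlasConnector::vector_add_vector(dim, r.data(), a.data(), 0.5, b.data(), 2.0);

    // For the GPU overloads, pass base_device::AbacusDevice_t::GpuDevice as the
    // trailing argument on device pointers, after BlasUtils::createGpuBlasHandle().
    return 0;
}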