Skip to content

Commit 840fdd0

Browse files
Critsium-xyFisherd99
authored andcommitted
[Feature] Add vector_mul_vector, vector_div_vector and vector_add_vector in blas_connector and added some GPU tests. (deepmodeling#5858)
* Added some other necessary kernels * Fix compiling bug * XX * Finish CUDA kernel * Fix marcos * Fix typename * GPU implementation * Fix bugs * add vector_add_vector kernel * Add blas_connector CPU tests * Fix blas usgae * Add initializer and GPU tests
1 parent a77e30c commit 840fdd0

File tree

5 files changed

+438
-9
lines changed

5 files changed

+438
-9
lines changed

source/module_base/blas_connector.cpp

Lines changed: 116 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "blas_connector.h"
2+
#include "macros.h"
23

34
#ifdef __DSP
45
#include "module_base/kernels/dsp/dsp_connector.h"
@@ -8,12 +9,10 @@
89
#ifdef __CUDA
910
#include <base/macros/macros.h>
1011
#include <cuda_runtime.h>
11-
#include <thrust/complex.h>
12-
#include <thrust/execution_policy.h>
13-
#include <thrust/inner_product.h>
14-
#include "module_base/tool_quit.h"
15-
1612
#include "cublas_v2.h"
13+
#include "module_hsolver/kernels/math_kernel_op.h"
14+
#include "module_base/module_device/memory_op.h"
15+
1716

1817
namespace BlasUtils{
1918

@@ -652,4 +651,116 @@ void BlasConnector::copy(const long n, const std::complex<double> *a, const int
652651
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
653652
zcopy_(&n, a, &incx, b, &incy);
654653
}
654+
}
655+
656+
657+
template <typename T>
658+
void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
659+
using Real = typename GetTypeReal<T>::type;
660+
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
661+
#ifdef _OPENMP
662+
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
663+
#endif
664+
for (int i = 0; i < dim; i++)
665+
{
666+
result[i] = vector1[i] * vector2[i];
667+
}
668+
}
669+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
670+
#ifdef __CUDA
671+
hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
672+
#endif
673+
}
674+
}
675+
676+
677+
template <typename T>
678+
void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
679+
using Real = typename GetTypeReal<T>::type;
680+
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
681+
#ifdef _OPENMP
682+
#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
683+
#endif
684+
for (int i = 0; i < dim; i++)
685+
{
686+
result[i] = vector1[i] / vector2[i];
687+
}
688+
}
689+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
690+
#ifdef __CUDA
691+
hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
692+
#endif
693+
}
694+
}
695+
696+
void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type)
697+
{
698+
if (device_type == base_device::CpuDevice){
699+
#ifdef _OPENMP
700+
#pragma omp parallel for schedule(static, 8192 / sizeof(float))
701+
#endif
702+
for (int i = 0; i < dim; i++)
703+
{
704+
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
705+
}
706+
}
707+
else if (device_type == base_device::GpuDevice){
708+
#ifdef __CUDA
709+
hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
710+
#endif
711+
}
712+
}
713+
714+
void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type)
715+
{
716+
if (device_type == base_device::CpuDevice){
717+
#ifdef _OPENMP
718+
#pragma omp parallel for schedule(static, 8192 / sizeof(double))
719+
#endif
720+
for (int i = 0; i < dim; i++)
721+
{
722+
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
723+
}
724+
}
725+
else if (device_type == base_device::GpuDevice){
726+
#ifdef __CUDA
727+
hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
728+
#endif
729+
}
730+
}
731+
732+
void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type)
733+
{
734+
if (device_type == base_device::CpuDevice){
735+
#ifdef _OPENMP
736+
#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<float>))
737+
#endif
738+
for (int i = 0; i < dim; i++)
739+
{
740+
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
741+
}
742+
}
743+
else if (device_type == base_device::GpuDevice){
744+
#ifdef __CUDA
745+
hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
746+
#endif
747+
}
748+
}
749+
750+
void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type)
751+
{
752+
if (device_type == base_device::CpuDevice){
753+
#ifdef _OPENMP
754+
#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<double>))
755+
#endif
756+
for (int i = 0; i < dim; i++)
757+
{
758+
result[i] = vector1[i] * constant1 + vector2[i] * constant2;
759+
}
760+
}
761+
else if (device_type == base_device::GpuDevice){
762+
#ifdef __CUDA
763+
hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
764+
#endif
765+
}
655766
}

source/module_base/blas_connector.h

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <complex>
55
#include "module_base/module_device/types.h"
6+
#include "macros.h"
67

78
// These still need to be linked in the header file
89
// Because quite a lot of code will directly use the original cblas kernels.
@@ -303,9 +304,38 @@ class BlasConnector
303304
static
304305
void copy(const long n, const std::complex<double> *a, const int incx, std::complex<double> *b, const int incy, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
305306

306-
// A is symmetric
307+
// There is some other operators needed, so implemented manually here
308+
template <typename T>
309+
static
310+
void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
311+
312+
template <typename T>
313+
static
314+
void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
315+
316+
// y = alpha * x + beta * y
317+
static
318+
void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
319+
320+
static
321+
void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
322+
323+
static
324+
void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
325+
326+
static
327+
void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
307328
};
308329

330+
#ifdef __CUDA
331+
332+
namespace BlasUtils{
333+
void createGpuBlasHandle();
334+
void destoryBLAShandle();
335+
}
336+
337+
#endif
338+
309339
// If GATHER_INFO is defined, the original function is replaced with a "i" suffix,
310340
// preventing changes on the original code.
311341
// The real function call is at gather_math_lib_info.cpp

source/module_base/kernels/cuda/math_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#include "cuda_runtime.h"
1+
#include <cuda_runtime.h>
22
#include "module_base/kernels/math_op.h"
33

44
#include <base/macros/macros.h>

0 commit comments

Comments
 (0)