11#include " blas_connector.h"
2+ #include " macros.h"
23
34#ifdef __DSP
45#include " module_base/kernels/dsp/dsp_connector.h"
89#ifdef __CUDA
910#include < base/macros/macros.h>
1011#include < cuda_runtime.h>
11- #include < thrust/complex.h>
12- #include < thrust/execution_policy.h>
13- #include < thrust/inner_product.h>
14- #include " module_base/tool_quit.h"
15-
1612#include " cublas_v2.h"
13+ #include " module_hsolver/kernels/math_kernel_op.h"
14+ #include " module_base/module_device/memory_op.h"
15+
1716
1817namespace BlasUtils {
1918
@@ -652,4 +651,116 @@ void BlasConnector::copy(const long n, const std::complex<double> *a, const int
652651 if (device_type == base_device::AbacusDevice_t::CpuDevice) {
653652 zcopy_ (&n, a, &incx, b, &incy);
654653 }
654+ }
655+
656+
657+ template <typename T>
658+ void vector_mul_vector (const int & dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
659+ using Real = typename GetTypeReal<T>::type;
660+ if (device_type == base_device::AbacusDevice_t::CpuDevice) {
661+ #ifdef _OPENMP
662+ #pragma omp parallel for schedule(static, 4096 / sizeof(Real))
663+ #endif
664+ for (int i = 0 ; i < dim; i++)
665+ {
666+ result[i] = vector1[i] * vector2[i];
667+ }
668+ }
669+ else if (device_type == base_device::AbacusDevice_t::GpuDevice){
670+ #ifdef __CUDA
671+ hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
672+ #endif
673+ }
674+ }
675+
676+
677+ template <typename T>
678+ void vector_div_vector (const int & dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
679+ using Real = typename GetTypeReal<T>::type;
680+ if (device_type == base_device::AbacusDevice_t::CpuDevice) {
681+ #ifdef _OPENMP
682+ #pragma omp parallel for schedule(static, 4096 / sizeof(Real))
683+ #endif
684+ for (int i = 0 ; i < dim; i++)
685+ {
686+ result[i] = vector1[i] / vector2[i];
687+ }
688+ }
689+ else if (device_type == base_device::AbacusDevice_t::GpuDevice){
690+ #ifdef __CUDA
691+ hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
692+ #endif
693+ }
694+ }
695+
696+ void vector_add_vector (const int & dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type)
697+ {
698+ if (device_type == base_device::CpuDevice){
699+ #ifdef _OPENMP
700+ #pragma omp parallel for schedule(static, 8192 / sizeof(float))
701+ #endif
702+ for (int i = 0 ; i < dim; i++)
703+ {
704+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
705+ }
706+ }
707+ else if (device_type == base_device::GpuDevice){
708+ #ifdef __CUDA
709+ hsolver::constantvector_addORsub_constantVector_op<float , base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
710+ #endif
711+ }
712+ }
713+
714+ void vector_add_vector (const int & dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type)
715+ {
716+ if (device_type == base_device::CpuDevice){
717+ #ifdef _OPENMP
718+ #pragma omp parallel for schedule(static, 8192 / sizeof(double))
719+ #endif
720+ for (int i = 0 ; i < dim; i++)
721+ {
722+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
723+ }
724+ }
725+ else if (device_type == base_device::GpuDevice){
726+ #ifdef __CUDA
727+ hsolver::constantvector_addORsub_constantVector_op<double , base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
728+ #endif
729+ }
730+ }
731+
732+ void vector_add_vector (const int & dim, std::complex <float > *result, const std::complex <float > *vector1, const float constant1, const std::complex <float > *vector2, const float constant2, base_device::AbacusDevice_t device_type)
733+ {
734+ if (device_type == base_device::CpuDevice){
735+ #ifdef _OPENMP
736+ #pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<float>))
737+ #endif
738+ for (int i = 0 ; i < dim; i++)
739+ {
740+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
741+ }
742+ }
743+ else if (device_type == base_device::GpuDevice){
744+ #ifdef __CUDA
745+ hsolver::constantvector_addORsub_constantVector_op<std::complex <float >, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
746+ #endif
747+ }
748+ }
749+
750+ void vector_add_vector (const int & dim, std::complex <double > *result, const std::complex <double > *vector1, const double constant1, const std::complex <double > *vector2, const double constant2, base_device::AbacusDevice_t device_type)
751+ {
752+ if (device_type == base_device::CpuDevice){
753+ #ifdef _OPENMP
754+ #pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<double>))
755+ #endif
756+ for (int i = 0 ; i < dim; i++)
757+ {
758+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
759+ }
760+ }
761+ else if (device_type == base_device::GpuDevice){
762+ #ifdef __CUDA
763+ hsolver::constantvector_addORsub_constantVector_op<std::complex <double >, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
764+ #endif
765+ }
655766}
0 commit comments