diff --git a/source/module_base/blas_connector.cpp b/source/module_base/blas_connector.cpp
index 106eb6c4e8..14fb76e2ed 100644
--- a/source/module_base/blas_connector.cpp
+++ b/source/module_base/blas_connector.cpp
@@ -1,4 +1,5 @@
#include "blas_connector.h"
+#include "macros.h"
#ifdef __DSP
#include "module_base/kernels/dsp/dsp_connector.h"
@@ -8,12 +9,10 @@
#ifdef __CUDA
#include <base/macros/macros.h>
#include <cuda_runtime.h>
-#include <thrust/complex.h>
-#include <thrust/execution_policy.h>
-#include <thrust/inner_product.h>
-#include "module_base/tool_quit.h"
-
#include "cublas_v2.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/module_device/memory_op.h"
+
namespace BlasUtils{
@@ -652,4 +651,116 @@ void BlasConnector::copy(const long n, const std::complex<double> *a, const int
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
zcopy_(&n, a, &incx, b, &incy);
}
+}
+
+
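+// The helpers below share one pattern: the CPU branch is a plain OpenMP loop,
+// while the GPU branch (compiled with __CUDA) forwards to the corresponding hsolver kernel.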
+template <typename T>
+void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
+ using Real = typename GetTypeReal<T>::type;
+ if (device_type == base_device::AbacusDevice_t::CpuDevice) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+ for (int i = 0; i < dim; i++)
+ {
+ result[i] = vector1[i] * vector2[i];
+ }
+ }
+ else if (device_type == base_device::AbacusDevice_t::GpuDevice){
+#ifdef __CUDA
+ hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+#endif
+ }
+}
+
+
+template <typename T>
+void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type){
+ using Real = typename GetTypeReal<T>::type;
+ if (device_type == base_device::AbacusDevice_t::CpuDevice) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+ for (int i = 0; i < dim; i++)
+ {
+ result[i] = vector1[i] / vector2[i];
+ }
+ }
+ else if (device_type == base_device::AbacusDevice_t::GpuDevice){
+#ifdef __CUDA
+ hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+#endif
+ }
+}
+
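+// result = constant1 * vector1 + constant2 * vector2, overloaded for float, double,
+// std::complex<float> and std::complex<double>.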
+void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type)
+{
+ if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(float))
+#endif
+ for (int i = 0; i < dim; i++)
+ {
+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+ }
+ }
+ else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+ hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+ }
+}
+
+void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type)
+{
+ if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(double))
+#endif
+ for (int i = 0; i < dim; i++)
+ {
+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+ }
+ }
+ else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+ hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+ }
+}
+
+void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type)
+{
+ if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<float>))
+#endif
+ for (int i = 0; i < dim; i++)
+ {
+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+ }
+ }
+ else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+ hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+ }
+}
+
+void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type)
+{
+ if (device_type == base_device::CpuDevice){
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 8192 / sizeof(std::complex<double>))
+#endif
+ for (int i = 0; i < dim; i++)
+ {
+ result[i] = vector1[i] * constant1 + vector2[i] * constant2;
+ }
+ }
+ else if (device_type == base_device::GpuDevice){
+#ifdef __CUDA
+ hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+#endif
+ }
}
\ No newline at end of file
diff --git a/source/module_base/blas_connector.h b/source/module_base/blas_connector.h
index 7675429520..387ff32a46 100644
--- a/source/module_base/blas_connector.h
+++ b/source/module_base/blas_connector.h
@@ -3,6 +3,7 @@
#include <complex>
#include "module_base/module_device/types.h"
+#include "macros.h"
// These still need to be linked in the header file
// Because quite a lot of code will directly use the original cblas kernels.
@@ -303,9 +304,38 @@ class BlasConnector
static
void copy(const long n, const std::complex<double> *a, const int incx, std::complex<double> *b, const int incy, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
- // A is symmetric
+ // Some other operators are needed, so they are implemented manually here
+ template <typename T>
+ static
+ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+ template <typename T>
+ static
+ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vector2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+ // result = constant1 * vector1 + constant2 * vector2
+ static
+ void vector_add_vector(const int& dim, float *result, const float *vector1, const float constant1, const float *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+ static
+ void vector_add_vector(const int& dim, double *result, const double *vector1, const double constant1, const double *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+ static
+ void vector_add_vector(const int& dim, std::complex<float> *result, const std::complex<float> *vector1, const float constant1, const std::complex<float> *vector2, const float constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+
+ static
+ void vector_add_vector(const int& dim, std::complex<double> *result, const std::complex<double> *vector1, const double constant1, const std::complex<double> *vector2, const double constant2, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
};
+#ifdef __CUDA
+
+namespace BlasUtils{
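+ // Shared cuBLAS handle management: create the handle once before any GPU BlasConnector call and destroy it afterwards.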
+ void createGpuBlasHandle();
+ void destoryBLAShandle();
+}
+
+#endif
+
// If GATHER_INFO is defined, the original function is replaced with a "i" suffix,
// preventing changes on the original code.
// The real function call is at gather_math_lib_info.cpp
diff --git a/source/module_base/kernels/cuda/math_op.cu b/source/module_base/kernels/cuda/math_op.cu
index cee820414e..cef3c04f3f 100644
--- a/source/module_base/kernels/cuda/math_op.cu
+++ b/source/module_base/kernels/cuda/math_op.cu
@@ -1,4 +1,4 @@
-#include "cuda_runtime.h"
+#include <cuda_runtime.h>
#include "module_base/kernels/math_op.h"
#include <base/macros/macros.h>
diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp
index cf445ecb7a..34f4cb51bb 100644
--- a/source/module_base/test/blas_connector_test.cpp
+++ b/source/module_base/test/blas_connector_test.cpp
@@ -1,4 +1,5 @@
#include "../blas_connector.h"
+#include "../module_device/memory_op.h"
#include "gtest/gtest.h"
#include <complex>
@@ -84,8 +85,7 @@ TEST(blas_connector, Scal) {
};
for (int i = 0; i < size; i++)
answer[i] = result[i] * scale;
- BlasConnector bs;
- bs.scal(size,scale,result,incx);
+ BlasConnector::scal(size,scale,result,incx);
// incx is the spacing between elements if result
for (int i = 0; i < size; i++) {
EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
@@ -93,6 +93,33 @@ TEST(blas_connector, Scal) {
}
}
+#ifdef __CUDA
+
+TEST(blas_connector, ScalGpu) {
+ const int size = 8;
+ const std::complex<double> scale = {2, 3};
+ const int incx = 1;
+ std::complex<double> result[8], answer[8];
+ std::complex<double>* result_gpu = nullptr;
+ resmem_zd_op()(gpu_ctx, result_gpu, 8 * sizeof(std::complex<double>));
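+ // resmem_zd_op / syncmem_z2z_* / delmem_zd_op are the helpers from module_device/memory_op.h
+ // used here to allocate, copy and free std::complex<double> buffers on the GPU.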
+ for (int i=0; i< size; i++) {
+ result[i] = std::complex<double>{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ };
+ for (int i = 0; i < size; i++)
+ answer[i] = result[i] * scale;
+ syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result, sizeof(std::complex<double>) * 8);
+ BlasConnector::scal(size,scale,result_gpu,incx,base_device::AbacusDevice_t::GpuDevice);
+ syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result, result_gpu, sizeof(std::complex<double>) * 8);
+ delmem_zd_op()(gpu_ctx, result_gpu);
+ // incx is the spacing between elements of result
+ for (int i = 0; i < size; i++) {
+ EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
+ EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
+ }
+}
+
+#endif
TEST(blas_connector, daxpy_) {
typedef double T;
@@ -136,6 +163,67 @@ TEST(blas_connector, zaxpy_) {
}
}
+TEST(blas_connector, Axpy) {
+ typedef std::complex<double> T;
+ const int size = 8;
+ const T scale = {2, 3};
+ const int incx = 1;
+ const int incy = 1;
+ std::array<T, size> x_const, result, answer;
+ std::generate(x_const.begin(), x_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(result.begin(), result.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ for (int i = 0; i < size; i++)
+ answer[i] = x_const[i] * scale + result[i];
+ BlasConnector::axpy(size, scale, x_const.data(), incx, result.data(), incy);
+ for (int i = 0; i < size; i++) {
+ EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
+ EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
+ }
+}
+
+#ifdef __CUDA
+
+TEST(blas_connector, AxpyGpu) {
+ typedef std::complex<double> T;
+ const int size = 8;
+ const T scale = {2, 3};
+ const int incx = 1;
+ const int incy = 1;
+ std::array<T, size> x_const, result, answer;
+ T* x_gpu = nullptr;
+ T* result_gpu = nullptr;
+ resmem_zd_op()(gpu_ctx, x_gpu, size * sizeof(std::complex<double>));
+ resmem_zd_op()(gpu_ctx, result_gpu, size * sizeof(std::complex<double>));
+ std::generate(x_const.begin(), x_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(result.begin(), result.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ for (int i = 0; i < size; i++)
+ answer[i] = x_const[i] * scale + result[i];
+ syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex<double>) * size);
+ syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, x_gpu, x_const.data(), sizeof(std::complex<double>) * size);
+ BlasConnector::axpy(size, scale, x_gpu, incx, result_gpu, incy, base_device::AbacusDevice_t::GpuDevice);
+ syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex<double>) * size);
+ delmem_zd_op()(gpu_ctx, result_gpu);
+ delmem_zd_op()(gpu_ctx, x_gpu);
+ for (int i = 0; i < size; i++) {
+ EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
+ EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
+ }
+}
+
+#endif
+
TEST(blas_connector, dcopy_) {
typedef double T;
long const size = 8;
@@ -313,6 +401,84 @@ TEST(blas_connector, zgemv_) {
}
}
+TEST(blas_connector, Gemv) {
+ typedef std::complex<double> T;
+ const char transa_m = 'N';
+ const char transa_n = 'T';
+ const char transa_h = 'C';
+ const int size_m = 3;
+ const int size_n = 4;
+ const T alpha_const = {2, 3};
+ const T beta_const = {3, 4};
+ const int lda = 5;
+ const int incx = 1;
+ const int incy = 1;
+ std::array<T, size_m> x_const_m, x_const_c, result_m, answer_m, c_dot_m{};
+ std::array<T, size_n> x_const_n, result_n, result_c, answer_n, answer_c,
+ c_dot_n{}, c_dot_c{};
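+ // The three calls below exercise gemv with 'N', 'T' and 'C'; a reference result is
+ // accumulated by hand for each case and compared element-wise afterwards.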
+ std::generate(x_const_n.begin(), x_const_n.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(result_n.begin(), result_n.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(x_const_m.begin(), x_const_m.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(result_m.begin(), result_m.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::array<T, lda * size_n> a_const;
+ std::generate(a_const.begin(), a_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ for (int i = 0; i < size_m; i++) {
+ for (int j = 0; j < size_n; j++) {
+ c_dot_m[i] += a_const[i + j * lda] * x_const_n[j];
+ }
+ answer_m[i] = alpha_const * c_dot_m[i] + beta_const * result_m[i];
+ }
+ BlasConnector::gemv(transa_m, size_m, size_n, alpha_const, a_const.data(), lda,
+ x_const_n.data(), incx, beta_const, result_m.data(), incy);
+
+ for (int j = 0; j < size_n; j++) {
+ for (int i = 0; i < size_m; i++) {
+ c_dot_n[j] += a_const[i + j * lda] * x_const_m[i];
+ }
+ answer_n[j] = alpha_const * c_dot_n[j] + beta_const * result_n[j];
+ }
+ BlasConnector::gemv(transa_n, size_m, size_n, alpha_const, a_const.data(), lda,
+ x_const_m.data(), incx, beta_const, result_n.data(), incy);
+
+ for (int j = 0; j < size_n; j++) {
+ for (int i = 0; i < size_m; i++) {
+ c_dot_c[j] += conj(a_const[i + j * lda]) * x_const_c[i];
+ }
+ answer_c[j] = alpha_const * c_dot_c[j] + beta_const * result_c[j];
+ }
+ BlasConnector::gemv(transa_h, size_m, size_n, alpha_const, a_const.data(), lda,
+ x_const_c.data(), incx, beta_const, result_c.data(), incy);
+
+ for (int i = 0; i < size_m; i++) {
+ EXPECT_DOUBLE_EQ(answer_m[i].real(), result_m[i].real());
+ EXPECT_DOUBLE_EQ(answer_m[i].imag(), result_m[i].imag());
+ }
+ for (int j = 0; j < size_n; j++) {
+ EXPECT_DOUBLE_EQ(answer_n[j].real(), result_n[j].real());
+ EXPECT_DOUBLE_EQ(answer_n[j].imag(), result_n[j].imag());
+ }
+ for (int j = 0; j < size_n; j++) {
+ EXPECT_DOUBLE_EQ(answer_c[j].real(), result_c[j].real());
+ EXPECT_DOUBLE_EQ(answer_c[j].imag(), result_c[j].imag());
+ }
+}
+
+
TEST(blas_connector, dgemm_) {
typedef double T;
const char transa_m = 'N';
@@ -404,7 +570,128 @@ TEST(blas_connector, zgemm_) {
}
}
+TEST(blas_connector, Gemm) {
+ typedef std::complex<double> T;
+ const char transa_m = 'N';
+ const char transb_m = 'N';
+ const int size_m = 3;
+ const int size_n = 4;
+ const int size_k = 5;
+ const T alpha_const = {2, 3};
+ const T beta_const = {3, 4};
+ const int lda = 6;
+ const int ldb = 5;
+ const int ldc = 4;
+ std::array<T, size_k * lda> a_const;
+ std::array<T, size_n * ldb> b_const;
+ std::array<T, size_n * ldc> c_dot{}, answer, result;
+ std::generate(a_const.begin(), a_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(b_const.begin(), b_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(result.begin(), result.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ for (int i = 0; i < size_m; i++) {
+ for (int j = 0; j < size_n; j++) {
+ for (int k = 0; k < size_k; k++) {
+ c_dot[i + j * ldc] +=
+ a_const[i + k * lda] * b_const[k + j * ldb];
+ }
+ answer[i + j * ldc] = alpha_const * c_dot[i + j * ldc] +
+ beta_const * result[i + j * ldc];
+ }
+ }
+ BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const,
+ a_const.data(), lda, b_const.data(), ldb, beta_const,
+ result.data(), ldc);
+
+ for (int i = 0; i < size_m; i++)
+ for (int j = 0; j < size_n; j++) {
+ EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(),
+ result[i + j * ldc].real());
+ EXPECT_DOUBLE_EQ(answer[i + j * ldc].imag(),
+ result[i + j * ldc].imag());
+ }
+}
+
+#ifdef __CUDA
+
+TEST(blas_connector, GemmGpu) {
+ typedef std::complex<double> T;
+ const char transa_m = 'N';
+ const char transb_m = 'N';
+ const int size_m = 3;
+ const int size_n = 4;
+ const int size_k = 5;
+ const T alpha_const = {2, 3};
+ const T beta_const = {3, 4};
+ const int lda = 6;
+ const int ldb = 5;
+ const int ldc = 4;
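+ // Column-major storage: the A buffer holds lda * size_k elements, B holds ldb * size_n
+ // and C holds ldc * size_n, so the host arrays and device buffers below are sized accordingly.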
+ std::array<T, size_k * lda> a_const;
+ std::array<T, size_n * ldb> b_const;
+ std::array<T, size_n * ldc> c_dot{}, answer, result;
+ std::complex<double>* a_gpu = nullptr;
+ std::complex<double>* b_gpu = nullptr;
+ std::complex<double>* result_gpu = nullptr;
+ resmem_zd_op()(gpu_ctx, a_gpu, size_k * lda * sizeof(std::complex<double>));
+ resmem_zd_op()(gpu_ctx, b_gpu, size_n * ldb * sizeof(std::complex<double>));
+ resmem_zd_op()(gpu_ctx, result_gpu, size_n * ldc * sizeof(std::complex<double>));
+ std::generate(a_const.begin(), a_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(b_const.begin(), b_const.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ std::generate(result.begin(), result.end(), []() {
+ return T{static_cast<double>(std::rand() / double(RAND_MAX)),
+ static_cast<double>(std::rand() / double(RAND_MAX))};
+ });
+ for (int i = 0; i < size_m; i++) {
+ for (int j = 0; j < size_n; j++) {
+ for (int k = 0; k < size_k; k++) {
+ c_dot[i + j * ldc] +=
+ a_const[i + k * lda] * b_const[k + j * ldb];
+ }
+ answer[i + j * ldc] = alpha_const * c_dot[i + j * ldc] +
+ beta_const * result[i + j * ldc];
+ }
+ }
+ syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, a_gpu, a_const.data(), sizeof(std::complex<double>) * size_k * lda);
+ syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, b_gpu, b_const.data(), sizeof(std::complex<double>) * size_n * ldb);
+ syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex<double>) * size_n * ldc);
+ BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const,
+ a_gpu, lda, b_gpu, ldb, beta_const,
+ result_gpu, ldc, base_device::AbacusDevice_t::GpuDevice);
+ syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex<double>) * size_n * ldc);
+ delmem_zd_op()(gpu_ctx, result_gpu);
+ delmem_zd_op()(gpu_ctx, a_gpu);
+ delmem_zd_op()(gpu_ctx, b_gpu);
+ for (int i = 0; i < size_m; i++)
+ for (int j = 0; j < size_n; j++) {
+ EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(),
+ result[i + j * ldc].real());
+ EXPECT_DOUBLE_EQ(answer[i + j * ldc].imag(),
+ result[i + j * ldc].imag());
+ }
+}
+
+#endif
+
int main(int argc, char **argv) {
+#ifdef __CUDA
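+ // The GPU tests above go through cuBLAS via BlasConnector, so the shared handle
+ // has to be created before RUN_ALL_TESTS() is invoked.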
+ std::cout << "Initializing CublasHandle..." << std::endl;
+ BlasUtils::createGpuBlasHandle();
+ std::cout << "Initializing CublasHandle Done." << std::endl;
+#endif
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 149b9ce389..70ed5ebf0b 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -1043,6 +1043,7 @@ template struct line_minimize_with_block_op<std::complex<double>, base_device::DE
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;