Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion source/source_base/kernels/cuda/math_kernel_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const
}
}

template <typename T, typename Real>
__global__ void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (col >= n || row >= m) return;
c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
}

cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name)
{
if (trans == 'N')
Expand All @@ -147,7 +155,7 @@ cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char*
{
return CUBLAS_OP_C;
}
else
else
{
ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !"));
}
Expand Down Expand Up @@ -438,10 +446,44 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
cudaCheckOnDebug();
}

template <>
void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){
dim3 thread(16, 16, 1);
dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
matrix_multiply_vector_kernel<double, double> <<<block, thread >>>(m, n, a, lda,
b, alpha, c, ldc);
cudaCheckOnDebug();
}

template <>
void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
std::complex<float> *a, const int &lda, const float *b, const float alpha, std::complex<float> *c, const int &ldc){
dim3 thread(16, 16, 1);
dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
matrix_multiply_vector_kernel<thrust::complex<float>, float> <<<block, thread >>>(m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
cudaCheckOnDebug();
}

template <>
void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
std::complex<double> *a, const int &lda, const double *b, const double alpha, std::complex<double> *c, const int &ldc)
{
dim3 thread(16, 16, 1);
dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
matrix_multiply_vector_kernel<thrust::complex<double>, double> <<<block, thread >>>(m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
cudaCheckOnDebug();
}

// Explicitly instantiate functors for the types of functor registered.

template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
template struct matrixCopy<double, base_device::DEVICE_GPU>;
template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;

template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct matrix_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
} // namespace ModuleBase
25 changes: 25 additions & 0 deletions source/source_base/kernels/math_kernel_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,35 @@ struct matrixCopy<T, base_device::DEVICE_CPU>
}
};

template <typename T>
struct matrix_mul_vector_op<T, base_device::DEVICE_CPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(const int& m, const int &n,
T *a,
const int &lda,
const Real *b,
const Real alpha,
T *c,
const int &ldc){
#ifdef _OPENMP
#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
#endif
for (int j = 0; j < n; j++){
for (int i = 0; i < m; i++){
c[j * ldc + i] = a[j * lda + i] * b[j] * alpha;
}
}

}
};

template struct gemv_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct gemv_op<float, base_device::DEVICE_CPU>;
template struct gemm_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct gemm_op<float, base_device::DEVICE_CPU>;
template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_CPU>;

template struct gemv_op<std::complex<double>, base_device::DEVICE_CPU>;
template struct gemv_op<double, base_device::DEVICE_CPU>;
Expand All @@ -133,6 +156,8 @@ template struct gemm_op<double, base_device::DEVICE_CPU>;
template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
template struct matrixCopy<double, base_device::DEVICE_CPU>;
template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
template struct matrix_mul_vector_op<double, base_device::DEVICE_CPU>;
template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;

#ifdef __LCAO
template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
Expand Down
40 changes: 38 additions & 2 deletions source/source_base/kernels/math_kernel_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ template <typename T, typename Device> struct vector_div_constant_op {
///
/// Input Parameters
/// \param dim : array size
/// \param vector : input array
/// \param vector : input array
/// \param constant : input constant
///
/// Output Parameters
Expand Down Expand Up @@ -298,6 +298,31 @@ template <typename T, typename Device> struct matrixCopy {
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
};

template <typename T, typename Device>
struct matrix_mul_vector_op {
using Real = typename GetTypeReal<T>::type;
/// @brief a * b * beta by each column
///
/// Input Parameters
/// \param m : row number
/// \param n : column number
/// \param a : input matrix
/// \param lda : leading dimension of matrix a
/// \param b : input vector
/// \param alpha : factor
/// \param ldc : leading dimension of matrix c
///
/// Output Parameters
/// \param c : output matrix
void operator()(const int &m, const int &n,
T *a,
const int &lda,
const Real *b,
const Real alpha,
T *c,
const int &ldc);
};

template <typename T, typename Device>
struct apply_eigenvalues_op {
using Real = typename GetTypeReal<T>::type;
Expand All @@ -314,7 +339,7 @@ struct precondition_op {
T* psi_iter,
const int& nbase,
const int& notconv,
const Real* precondition,
const Real* precondition,
const Real* eigenvalues);
};

Expand Down Expand Up @@ -393,6 +418,17 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
const int& LDB);
};

template <typename T> struct matrix_mul_vector_op<T, base_device::DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(const int &m, const int &n,
T *a,
const int &lda,
const Real *b,
const Real alpha,
T *c,
const int &ldc);
};

void createGpuBlasHandle();
void destoryBLAShandle();

Expand Down
42 changes: 41 additions & 1 deletion source/source_base/kernels/rocm/math_kernel_op.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,15 @@ __launch_bounds__(1024) __global__
}
}

template <typename T, typename Real>
__launch_bounds__(1024) __global__
void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (col >= n || row >= m) return;
c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
}

hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name)
{
if (trans == 'N')
Expand All @@ -159,7 +168,7 @@ hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char
{
return HIPBLAS_OP_C;
}
else
else
{
ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !"));
}
Expand Down Expand Up @@ -437,7 +446,38 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
hipCheckOnDebug();
}

template <>
void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){
dim3 thread(16, 16, 1);
dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<double, double>), dim3(block, thread),
m, n, a, lda, b, alpha, c, ldc);
hipCheckOnDebug();
}

template <>
void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
std::complex<float> *a, const int &lda, const float *b, const float alpha, std::complex<float> *c, const int &ldc){
dim3 thread(16, 16, 1);
dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<thrust::complex<float>, float>), dim3(block, thread),
m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
hipCheckOnDebug();
}

template <>
void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
std::complex<double> *a, const int &lda, const double *b, const double alpha, std::complex<double> *c, const int &ldc)
{
dim3 thread(16, 16, 1);
dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<thrust::complex<double>, double>), dim3(block, thread),
m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
hipCheckOnDebug();
}

// Explicitly instantiate functors for the types of functor registered.
template struct matrixCopy<double, base_device::DEVICE_GPU>;
Expand Down
68 changes: 68 additions & 0 deletions source/source_base/module_device/cuda/memory_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,16 @@ void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
cudaErrcheck(cudaMemset(arr, var, sizeof(FPTYPE) * size));
}

template <typename FPTYPE>
void set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
const size_t pitch,
const int var,
const size_t width,
const size_t height)
{
cudaErrcheck(cudaMemset2D(arr, sizeof(FPTYPE) * pitch , var, sizeof(FPTYPE) * width, height));
}

template <typename FPTYPE>
void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
FPTYPE* arr_out,
Expand Down Expand Up @@ -112,6 +122,42 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyDeviceToDevice));
}

template <typename FPTYPE>
void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
FPTYPE* arr_out,
const size_t dpitch,
const FPTYPE* arr_in,
const size_t spitch,
const size_t width,
const size_t height)
{
cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToHost));
}

template <typename FPTYPE>
void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
FPTYPE* arr_out,
const size_t dpitch,
const FPTYPE* arr_in,
const size_t spitch,
const size_t width,
const size_t height)
{
cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyHostToDevice));
}

template <typename FPTYPE>
void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
FPTYPE* arr_out,
const size_t dpitch,
const FPTYPE* arr_in,
const size_t spitch,
const size_t width,
const size_t height)
{
cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToDevice));
}

template <typename FPTYPE_out, typename FPTYPE_in>
struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
{
Expand Down Expand Up @@ -196,6 +242,12 @@ template struct set_memory_op<double, base_device::DEVICE_GPU>;
template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;

template struct set_memory_2d_op<int, base_device::DEVICE_GPU>;
template struct set_memory_2d_op<float, base_device::DEVICE_GPU>;
template struct set_memory_2d_op<double, base_device::DEVICE_GPU>;
template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;

template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
Expand All @@ -212,6 +264,22 @@ template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;

template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;

template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
Expand Down
Loading
Loading