source/module_base/kernels/cuda/math_kernel_op.cu (62 changes: 29 additions & 33 deletions)
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
 }
 }
 
-
 template <typename T>
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
+__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
 {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
 
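The rewritten kernel assigns one thread per element of the n1 x n2 block: blockIdx.x/threadIdx.x index rows and blockIdx.y/threadIdx.y index columns, so no thread loops over rows any more. A minimal standalone sketch of the same indexing and launch geometry, with made-up sizes and a stand-in kernel name (illustrative only, not code from this PR):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative stand-in for the new 2D copy kernel: row i in [0, n1), column j in [0, n2),
// source leading dimension LDA, destination leading dimension LDB.
__global__ void copy_2d(const int n1, const int n2, const double* A, const int LDA, double* B, const int LDB)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < n1 && j < n2)
    {
        B[i * LDB + j] = A[i * LDA + j];
    }
}

int main()
{
    const int n1 = 3, n2 = 5, LDA = 8, LDB = 5;   // copy a 3x5 block out of a padded 3x8 buffer
    double hA[n1 * LDA], hB[n1 * LDB];
    for (int k = 0; k < n1 * LDA; ++k) { hA[k] = k; }

    double *dA = nullptr, *dB = nullptr;
    cudaMalloc(&dA, sizeof(hA));
    cudaMalloc(&dB, sizeof(hB));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

    // Same launch geometry as in the PR: 16x16 threads, grid rounded up so every (i, j) is covered.
    const dim3 blockSize(16, 16);
    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
    copy_2d<<<gridSize, blockSize>>>(n1, n2, dA, LDA, dB, LDB);
    cudaMemcpy(hB, dB, sizeof(hB), cudaMemcpyDeviceToHost);

    printf("B[2][4] = %g (expected %g)\n", hB[2 * LDB + 4], hA[2 * LDA + 4]);
    cudaFree(dA);
    cudaFree(dB);
    return 0;
}
```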
@@ -1012,43 +1005,46 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const double* A,
     const int& LDA,
     double* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<double> <<<block, thread >>> (n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<double> <<<gridSize, blockSize >>> (n1, n2, A, LDA, B, LDB);
     cudaCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<float>* A,
     const int& LDA,
     std::complex<float>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     cudaCheckOnDebug();
 
 }
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<double>* A,
     const int& LDA,
     std::complex<double>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+    matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     cudaCheckOnDebug();
 }
 
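Compared with the old launch, which spawned one thread per column j < min(LDA, LDB) and had each thread copy all n rows (so padding columns up to the leading dimension were copied as well), the new launch touches exactly the logical n1 x n2 block and leaves any padding in B untouched. The grid is sized by ceil division so the fixed 16 x 16 blocks cover the whole range even when n1 or n2 is not a multiple of 16; a small host-side check of that arithmetic (the sample sizes are arbitrary):

```cpp
#include <cassert>
#include <cstdio>

int main()
{
    // blockSize above is fixed at 16 x 16; gridSize rounds up with (n + 15) / 16.
    const int bx = 16, by = 16;
    const int samples[] = {1, 15, 16, 17, 255, 256, 257};
    for (int n1 : samples)
    {
        for (int n2 : samples)
        {
            const int gx = (n1 + bx - 1) / bx;                 // gridSize.x
            const int gy = (n2 + by - 1) / by;                 // gridSize.y
            assert(gx * bx >= n1 && gy * by >= n2);            // every (i, j) gets a thread
            assert((gx - 1) * bx < n1 && (gy - 1) * by < n2);  // no fully idle trailing block
        }
    }
    std::printf("all sample sizes are covered\n");
    return 0;
}
```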
@@ -1062,23 +1058,23 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase
source/module_base/kernels/math_kernel_op.cpp (20 changes: 13 additions & 7 deletions)
@@ -346,16 +346,22 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
 };
 
 template <typename T>
-struct matrixSetToAnother<T, base_device::DEVICE_CPU>
+struct matrixCopy<T, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* d, const int& n, const T* A, const int& LDA, T* B, const int& LDB)
+    void operator()(const base_device::DEVICE_CPU* d,
+                    const int& n1,
+                    const int& n2,
+                    const T* A,
+                    const int& LDA,
+                    T* B,
+                    const int& LDB)
     {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
-        for (int i = 0; i < n; i++)
+        for (int i = 0; i < n1; i++)
         {
-            for (int j = 0; j < LDA; j++)
+            for (int j = 0; j < n2; j++)
             {
                 B[i * LDB + j] = A[i * LDA + j];
             }
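The CPU specialization keeps the same (n1, n2, LDA, LDB) convention: row i of the logical block starts at offset i * LDA in A and i * LDB in B, and the two loops are collapsed for OpenMP. A self-contained sketch of that loop structure with made-up sizes (a standalone copy of the loop, not the ABACUS functor itself):

```cpp
#include <cstdio>
#include <vector>

// Row-major strided copy mirroring the CPU specialization above: copy the n1 x n2 block of A
// (leading dimension LDA) into B (leading dimension LDB). Sizes in main() are illustrative.
template <typename T>
void copy_block(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
{
#ifdef _OPENMP
#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
#endif
    for (int i = 0; i < n1; i++)
    {
        for (int j = 0; j < n2; j++)
        {
            B[i * LDB + j] = A[i * LDA + j];
        }
    }
}

int main()
{
    const int n1 = 3, n2 = 5, LDA = 8, LDB = 6;
    std::vector<double> A(n1 * LDA), B(n1 * LDB, -1.0);
    for (int k = 0; k < n1 * LDA; ++k) { A[k] = k; }

    copy_block(n1, n2, A.data(), LDA, B.data(), LDB);

    // Columns j >= n2 of B keep their old value: only the logical n1 x n2 block is overwritten.
    std::printf("B[2][4] = %g, B[2][5] = %g\n", B[2 * LDB + 4], B[2 * LDB + 5]);
    return 0;
}
```

Running this prints B[2][4] = 20 and B[2][5] = -1, i.e. the padding column outside the copied block is left alone.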
@@ -376,7 +382,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 
@@ -394,7 +400,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrixCopy<double, base_device::DEVICE_CPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 
@@ -403,7 +410,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
 #endif
 #ifdef __DSP
source/module_base/kernels/math_kernel_op.h (24 changes: 13 additions & 11 deletions)
@@ -309,20 +309,20 @@ template <typename T, typename Device> struct matrixTranspose_op {
                   const T *input_matrix, T *output_matrix);
 };
 
-template <typename T, typename Device> struct matrixSetToAnother {
-  /// @brief initialize matrix B with A
+template <typename T, typename Device> struct matrixCopy {
+  /// @brief copy matrix A to B, they can have different leading dimensions
   ///
   /// Input Parameters
   /// \param d : the type of computing device
-  /// \param n : first dimension of matrix
+  /// \param n1 : first dimension of matrix
+  /// \param n2 : second dimension of matrix
   /// \param A : input matrix A
   /// \param LDA : leading dimension of A
   /// \param LDB : leading dimension of B
   ///
   /// Output Parameters
   /// \param B : output matrix B
-  void operator()(const Device *d, const int &n, const T *A, const int &LDA,
-                  T *B, const int &LDB);
+  void operator()(const Device* d, const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
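For callers migrating from matrixSetToAnother, the documented argument order is (d, n1, n2, A, LDA, B, LDB) rather than the old (d, n, A, LDA, B, LDB). A hedged sketch of a call site; the buffer names, extents, and surrounding setup are hypothetical and not taken from this PR:

```cpp
// Hypothetical call site: copy an nbands x nbasis block from psi_in (leading dimension ld_in)
// into psi_out (leading dimension ld_out). Assumes math_kernel_op.h is included and the buffers
// are already allocated; the device pointer is only a dispatch tag and is not dereferenced by
// the CPU specialization shown in math_kernel_op.cpp.
const base_device::DEVICE_CPU* cpu_ctx = nullptr;
ModuleBase::matrixCopy<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
                                                                        nbands,   // n1
                                                                        nbasis,   // n2
                                                                        psi_in,   // A
                                                                        ld_in,    // LDA
                                                                        psi_out,  // B
                                                                        ld_out);  // LDB
```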
@@ -382,12 +382,14 @@ struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
                   const Real constant2);
 };
 
-template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
-  void operator()(const base_device::DEVICE_GPU *d, const int &n,
-                  const T *A, // input
-                  const int &LDA,
-                  T *B, // output
-                  const int &LDB);
+template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
+  void operator()(const base_device::DEVICE_GPU* d,
+                  const int& n1,
+                  const int& n2,
+                  const T* A, // input
+                  const int& LDA,
+                  T* B, // output
+                  const int& LDB);
 };
 
 void createGpuBlasHandle();
source/module_base/kernels/rocm/math_kernel_op.hip.cu (65 changes: 32 additions & 33 deletions)
@@ -307,23 +307,16 @@ __global__ void matrix_transpose_kernel(
 }
 }
 
-
 template <typename T>
-__launch_bounds__(1024)
-__global__ void matrix_setTo_another_kernel(
-    const int n,
-    const int LDA,
-    const int LDB,
-    const T* matrix_A,
-    T* matrix_B)
-{
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j < LDA && j < LDB)
+__launch_bounds__(1024) __global__
+    void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i < n1 && j < n2)
     {
-        for (int i = 0; i < n; i++)
-        {
-            matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
-        }
+        B[i * LDB + j] = A[i * LDA + j];
     }
 }
 
@@ -921,42 +914,48 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 }
 
 template <>
-void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const double* A,
     const int& LDA,
     double* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<double>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, A, B);
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>), gridSize, blockSize, 0, 0, n1, n2, A, LDA, B, LDB);
     hipCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<float>* A,
     const int& LDA,
     std::complex<float>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<float>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<float>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
     hipCheckOnDebug();
 }
 template <>
-void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
-    const int& n,
+void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+    const int& n1,
+    const int& n2,
     const std::complex<double>* A,
     const int& LDA,
     std::complex<double>* B,
     const int& LDB)
 {
-    int thread = 1024;
-    int block = (LDA + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_setTo_another_kernel<thrust::complex<double>>), dim3(block), dim3(thread), 0, 0, n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+    const dim3 blockSize(16, 16);
+    const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<thrust::complex<double>>), gridSize, blockSize, 0, 0, n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
     hipCheckOnDebug();
 }
 
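The ROCm port uses the same kernel body and the same 16 x 16 launch geometry; the mechanical difference from the CUDA file is that the launch goes through hipLaunchKernelGGL instead of the triple-chevron syntax, and the kernel keeps __launch_bounds__(1024), which is still a valid upper bound for the 256-thread blocks. A side-by-side of the two launch spellings for the double case, with placeholder device pointers (adapted from the hunks above, not compilable on its own):

```cuda
// CUDA (math_kernel_op.cu): triple-chevron launch.
matrix_copy_kernel<double><<<gridSize, blockSize>>>(n1, n2, d_A, LDA, d_B, LDB);

// ROCm (math_kernel_op.hip.cu): the same launch through the HIP macro; the two extra
// arguments are the dynamic shared-memory size (0) and the stream (0 = default stream).
hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_copy_kernel<double>),
                   gridSize, blockSize, 0, 0,
                   n1, n2, d_A, LDA, d_B, LDB);
```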
@@ -970,7 +969,7 @@ template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 
 template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -979,14 +978,14 @@ template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
+template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
-template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
+template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
 } // namespace ModuleBase