Skip to content

Commit f3f84c5

Browse files
authored
Merge branch 'develop' into fix-out-wfc-pw
2 parents 6409c3a + 0b77de9 commit f3f84c5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+950
-902
lines changed

docs/advanced/input_files/input-main.md

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3541,11 +3541,7 @@ These variables are used to control berry phase and wannier90 interface paramete
35413541
- **Description**:
35423542
`td_lcut1` is the lower bound of the interval in the length gauge RT-TDDFT, where $x$ is the fractional coordinate:
35433543
$$
3544-
E(x)=
3545-
\begin{cases}
3546-
E_0, & \mathtt{cut1}\leqslant x \leqslant \mathtt{cut2} \\
3547-
-E_0\left(\dfrac{1}{\mathtt{cut1}+1-\mathtt{cut2}}-1\right), & \text{$0<x<\mathtt{cut1}$ or $\mathtt{cut2}<x<1$}
3548-
\end{cases}
3544+
E(x)=\begin{cases}E_0, & \mathtt{cut1}\leqslant x \leqslant \mathtt{cut2} \\-E_0\left(\dfrac{1}{\mathtt{cut1}+1-\mathtt{cut2}}-1\right), & 0 < x < \mathtt{cut1~~or~~cut2} < x < 1 \end{cases}
35493545
$$
35503546
- **Default**: 0.05
35513547

@@ -3555,11 +3551,7 @@ These variables are used to control berry phase and wannier90 interface paramete
35553551
- **Description**:
35563552
`td_lcut2` is the upper bound of the interval in the length gauge RT-TDDFT, where $x$ is the fractional coordinate:
35573553
$$
3558-
E(x)=
3559-
\begin{cases}
3560-
E_0, & \mathtt{cut1}\leqslant x \leqslant \mathtt{cut2} \\
3561-
-E_0\left(\dfrac{1}{\mathtt{cut1}+1-\mathtt{cut2}}-1\right), & \text{$0<x<\mathtt{cut1}$ or $\mathtt{cut2}<x<1$}
3562-
\end{cases}
3554+
E(x)=\begin{cases}E_0, & \mathtt{cut1}\leqslant x \leqslant \mathtt{cut2} \\-E_0\left(\dfrac{1}{\mathtt{cut1}+1-\mathtt{cut2}}-1\right), & 0 < x < \mathtt{cut1~~or~~cut2} < x < 1 \end{cases}
35633555
$$
35643556
- **Default**: 0.95
35653557

source/module_base/kernels/cuda/math_kernel_op.cu

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -385,22 +385,15 @@ __global__ void matrix_transpose_kernel(
385385
}
386386
}
387387

388-
389388
template <typename T>
390-
__global__ void matrix_setTo_another_kernel(
391-
const int n,
392-
const int LDA,
393-
const int LDB,
394-
const T* matrix_A,
395-
T* matrix_B)
389+
__global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const int LDA, T* B, const int LDB)
396390
{
397-
int j = blockIdx.x * blockDim.x + threadIdx.x;
398-
if (j < LDA && j < LDB)
391+
const int i = blockIdx.x * blockDim.x + threadIdx.x;
392+
const int j = blockIdx.y * blockDim.y + threadIdx.y;
393+
394+
if (i < n1 && j < n2)
399395
{
400-
for (int i = 0; i < n; i++)
401-
{
402-
matrix_B[i * LDB + j] = matrix_A[i * LDA + j];
403-
}
396+
B[i * LDB + j] = A[i * LDA + j];
404397
}
405398
}
406399

@@ -980,40 +973,43 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
980973
}
981974

982975
template <>
983-
void matrixSetToAnother<double, base_device::DEVICE_GPU>::operator()(const int& n,
984-
const double* A,
985-
const int& LDA,
986-
double* B,
987-
const int& LDB)
976+
void matrixCopy<double, base_device::DEVICE_GPU>::operator()(const int& n1,
977+
const int& n2,
978+
const double* A,
979+
const int& LDA,
980+
double* B,
981+
const int& LDB)
988982
{
989-
int thread = 1024;
990-
int block = (LDA + thread - 1) / thread;
991-
matrix_setTo_another_kernel<double> <<<block, thread >>> (n, LDA, LDB, A, B);
983+
const dim3 blockSize(16, 16);
984+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
985+
matrix_copy_kernel<double> <<<gridSize, blockSize >>> (n1, n2, A, LDA, B, LDB);
992986
cudaCheckOnDebug();
993987
}
994988
template <>
995-
void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n,
996-
const std::complex<float>* A,
997-
const int& LDA,
998-
std::complex<float>* B,
999-
const int& LDB)
989+
void matrixCopy<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& n1,
990+
const int& n2,
991+
const std::complex<float>* A,
992+
const int& LDA,
993+
std::complex<float>* B,
994+
const int& LDB)
1000995
{
1001-
int thread = 1024;
1002-
int block = (LDA + thread - 1) / thread;
1003-
matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
996+
const dim3 blockSize(16, 16);
997+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
998+
matrix_copy_kernel<thrust::complex<float>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<float>*>(A), LDA, reinterpret_cast<thrust::complex<float>*>(B), LDB);
1004999
cudaCheckOnDebug();
1000+
10051001
}
10061002
template <>
1007-
void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n,
1008-
const std::complex<double>* A,
1009-
const int& LDA,
1010-
std::complex<double>* B,
1011-
const int& LDB)
1003+
void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& n1,
1004+
const int& n2,
1005+
const std::complex<double>* A,
1006+
const int& LDA,
1007+
std::complex<double>* B,
1008+
const int& LDB)
10121009
{
1013-
int thread = 1024;
1014-
int block = (LDA + thread - 1) / thread;
1015-
matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread >>> (n, LDA, LDB, reinterpret_cast<const thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
1016-
1010+
const dim3 blockSize(16, 16);
1011+
const dim3 gridSize((n1 + blockSize.x - 1) / blockSize.x, (n2 + blockSize.y - 1) / blockSize.y);
1012+
matrix_copy_kernel<thrust::complex<double>> <<<gridSize, blockSize >>> (n1, n2, reinterpret_cast<const thrust::complex<double>*>(A), LDA, reinterpret_cast<thrust::complex<double>*>(B), LDB);
10171013
cudaCheckOnDebug();
10181014
}
10191015

@@ -1027,23 +1023,23 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GP
10271023
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
10281024
template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
10291025
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
1030-
template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>;
1026+
template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
10311027

10321028
template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
10331029
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10341030
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
10351031
template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
10361032
template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
10371033
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
1034+
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
10381035
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
1039-
template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
1036+
template struct matrixCopy<double, base_device::DEVICE_GPU>;
1037+
template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
10401038

10411039
#ifdef __LCAO
10421040
template struct dot_real_op<double, base_device::DEVICE_GPU>;
10431041
template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
10441042
template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
10451043
template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
1046-
template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
1047-
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
10481044
#endif
10491045
} // namespace ModuleBase

source/module_base/kernels/math_kernel_op.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,16 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
337337
};
338338

339339
template <typename T>
340-
struct matrixSetToAnother<T, base_device::DEVICE_CPU>
340+
struct matrixCopy<T, base_device::DEVICE_CPU>
341341
{
342-
void operator()(const int& n, const T* A, const int& LDA, T* B, const int& LDB)
342+
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB)
343343
{
344344
#ifdef _OPENMP
345345
#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
346346
#endif
347-
for (int i = 0; i < n; i++)
347+
for (int i = 0; i < n1; i++)
348348
{
349-
for (int j = 0; j < LDA; j++)
349+
for (int j = 0; j < n2; j++)
350350
{
351351
B[i * LDB + j] = A[i * LDA + j];
352352
}
@@ -367,7 +367,7 @@ template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_CP
367367
template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
368368
template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_CPU>;
369369
template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
370-
template struct matrixSetToAnother<std::complex<float>, base_device::DEVICE_CPU>;
370+
template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
371371
template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
372372
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
373373

@@ -385,7 +385,8 @@ template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_C
385385
template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
386386
template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_CPU>;
387387
template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
388-
template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU>;
388+
template struct matrixCopy<double, base_device::DEVICE_CPU>;
389+
template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
389390
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
390391
template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
391392

@@ -394,7 +395,6 @@ template struct vector_mul_vector_op<double, base_device::DEVICE_CPU>;
394395
template struct vector_div_constant_op<double, base_device::DEVICE_CPU>;
395396
template struct vector_div_vector_op<double, base_device::DEVICE_CPU>;
396397
template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
397-
template struct matrixSetToAnother<double, base_device::DEVICE_CPU>;
398398
template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_CPU>;
399399
#endif
400400
#ifdef __DSP

source/module_base/kernels/math_kernel_op.h

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -298,19 +298,19 @@ template <typename T, typename Device> struct matrixTranspose_op {
298298
const T *input_matrix, T *output_matrix);
299299
};
300300

301-
template <typename T, typename Device> struct matrixSetToAnother {
302-
/// @brief initialize matrix B with A
301+
template <typename T, typename Device> struct matrixCopy {
302+
/// @brief copy matrix A to B, they can have different leading dimensions
303303
///
304304
/// Input Parameters
305-
/// \param n : first dimension of matrix
305+
/// \param n1 : first dimension of matrix
306+
/// \param n2 : second dimension of matrix
306307
/// \param A : input matrix A
307308
/// \param LDA : leading dimension of A
308309
/// \param LDB : leading dimension of B
309310
///
310311
/// Output Parameters
311312
/// \param B : output matrix B
312-
void operator()(const int &n, const T *A, const int &LDA,
313-
T *B, const int &LDB);
313+
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
314314
};
315315

316316
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -370,12 +370,13 @@ struct constantvector_addORsub_constantVector_op<T, base_device::DEVICE_GPU> {
370370
const Real constant2);
371371
};
372372

373-
template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
374-
void operator()(const int &n,
375-
const T *A, // input
376-
const int &LDA,
377-
T *B, // output
378-
const int &LDB);
373+
template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
374+
void operator()(const int& n1,
375+
const int& n2,
376+
const T* A, // input
377+
const int& LDA,
378+
T* B, // output
379+
const int& LDB);
379380
};
380381

381382
void createGpuBlasHandle();

0 commit comments

Comments
 (0)