Skip to content

Commit 3f8fe4f

Browse files
RT-TDDFT GPU Acceleration: RT-TD now fully support GPU computation (#5773)
* Phase 1 of RT-TDDFT GPU Acceleration: Rewriting existing code using Tensor
* [pre-commit.ci lite] apply automatic fixes
* Initialize int info in bandenergy.cpp
* Initialize double aa, bb in bandenergy.cpp
* Fix a bug where CopyFrom caused shared data between tensors, using =(assignment operator overload) instead
* RT-TDDFT GPU Acceleration (Phase 2): Adding needed BLAS and LAPACK support for Tensor on CPU and refactoring linear algebra operations in TDDFT
* LAPACK wrapper functions: change const basic-type input parameters from pass-by-reference to pass-by-value
* Did nothing, just formatting esolver.cpp
* Core algorithm: RT-TD now has preliminary support for GPU computation
* Fix GitHub CI CUDA build bug due to deleted variable
* Refactor some files
* Getting ready for gathering MPI processes
* MPI multi-process compatibility
* Fix GitHub CI MPI compilation bug
* Minor fix and refactor
* Initialize double aa, bb and one line for one variable
* Rename bandenergy.cpp to band_energy.cpp and corresponding adjustments
* Fix compile error and change CMakeLists accordingly
* Initialize int naroc
* Initialize MPI related variables: myid, num_procs and root_proc
* Refactor Propagator class implementation into multiple files for better code organization
* Remove all GlobalV::ofs_running from RT-TDDFT core algorithms and pass it as an input parameter instead
* Add assert in some places and optimize redundant index calculations in nested loops

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
1 parent 0098171 commit 3f8fe4f

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

47 files changed

+4131
-1104
lines changed

source/Makefile.Objects

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,10 +557,13 @@ OBJS_IO_LCAO=cal_r_overlap_R.o\
557557

558558
OBJS_LCAO=evolve_elec.o\
559559
evolve_psi.o\
560-
bandenergy.o\
560+
band_energy.o\
561561
middle_hamilt.o\
562562
norm_psi.o\
563563
propagator.o\
564+
propagator_cn2.o\
565+
propagator_taylor.o\
566+
propagator_etrs.o\
564567
td_velocity.o\
565568
td_current.o\
566569
snap_psibeta_half_tddft.o\

source/module_base/lapack_connector.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ extern "C"
133133

134134
// zgetrf computes the LU factorization of a general matrix
135135
// while zgetri takes its output to perform matrix inversion
136-
void zgetrf_(const int* m, const int *n, const std::complex<double> *A, const int *lda, int *ipiv, const int* info);
137-
void zgetri_(const int* n, std::complex<double> *A, const int *lda, int *ipiv, std::complex<double> *work, int *lwork, const int *info);
136+
void zgetrf_(const int* m, const int *n, std::complex<double> *A, const int *lda, int *ipiv, int* info);
137+
void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);
138138

139139
// if trans=='N': C = alpha * A * A.H + beta * C
140140
// if trans=='C': C = alpha * A.H * A + beta * C

source/module_base/module_container/ATen/kernels/cuda/lapack.cu

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,49 @@ struct lapack_dngvd<T, DEVICE_GPU> {
117117
}
118118
};
119119

120+
template <typename T>
121+
struct lapack_getrf<T, DEVICE_GPU> {
122+
void operator()(
123+
const int& m,
124+
const int& n,
125+
T* Mat,
126+
const int& lda,
127+
int* ipiv)
128+
{
129+
cuSolverConnector::getrf(cusolver_handle, m, n, Mat, lda, ipiv);
130+
}
131+
};
132+
133+
template <typename T>
134+
struct lapack_getri<T, DEVICE_GPU> {
135+
void operator()(
136+
const int& n,
137+
T* Mat,
138+
const int& lda,
139+
const int* ipiv,
140+
T* work,
141+
const int& lwork)
142+
{
143+
throw std::runtime_error("cuSOLVER does not provide LU-based matrix inversion interface (getri). To compute the inverse on GPU, use getrs instead.");
144+
}
145+
};
146+
147+
template <typename T>
148+
struct lapack_getrs<T, DEVICE_GPU> {
149+
void operator()(
150+
const char& trans,
151+
const int& n,
152+
const int& nrhs,
153+
T* A,
154+
const int& lda,
155+
const int* ipiv,
156+
T* B,
157+
const int& ldb)
158+
{
159+
cuSolverConnector::getrs(cusolver_handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
160+
}
161+
};
162+
120163
template struct set_matrix<float, DEVICE_GPU>;
121164
template struct set_matrix<double, DEVICE_GPU>;
122165
template struct set_matrix<std::complex<float>, DEVICE_GPU>;
@@ -142,5 +185,20 @@ template struct lapack_dngvd<double, DEVICE_GPU>;
142185
template struct lapack_dngvd<std::complex<float>, DEVICE_GPU>;
143186
template struct lapack_dngvd<std::complex<double>, DEVICE_GPU>;
144187

188+
template struct lapack_getrf<float, DEVICE_GPU>;
189+
template struct lapack_getrf<double, DEVICE_GPU>;
190+
template struct lapack_getrf<std::complex<float>, DEVICE_GPU>;
191+
template struct lapack_getrf<std::complex<double>, DEVICE_GPU>;
192+
193+
template struct lapack_getri<float, DEVICE_GPU>;
194+
template struct lapack_getri<double, DEVICE_GPU>;
195+
template struct lapack_getri<std::complex<float>, DEVICE_GPU>;
196+
template struct lapack_getri<std::complex<double>, DEVICE_GPU>;
197+
198+
template struct lapack_getrs<float, DEVICE_GPU>;
199+
template struct lapack_getrs<double, DEVICE_GPU>;
200+
template struct lapack_getrs<std::complex<float>, DEVICE_GPU>;
201+
template struct lapack_getrs<std::complex<double>, DEVICE_GPU>;
202+
145203
} // namespace kernels
146204
} // namespace container

source/module_base/module_container/ATen/kernels/lapack.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,61 @@ struct lapack_dngvd<T, DEVICE_CPU> {
124124
}
125125
};
126126

127+
template <typename T>
128+
struct lapack_getrf<T, DEVICE_CPU> {
129+
void operator()(
130+
const int& m,
131+
const int& n,
132+
T* Mat,
133+
const int& lda,
134+
int* ipiv)
135+
{
136+
int info = 0;
137+
lapackConnector::getrf(m, n, Mat, lda, ipiv, info);
138+
if (info != 0) {
139+
throw std::runtime_error("getrf failed with info = " + std::to_string(info));
140+
}
141+
}
142+
};
143+
144+
template <typename T>
145+
struct lapack_getri<T, DEVICE_CPU> {
146+
void operator()(
147+
const int& n,
148+
T* Mat,
149+
const int& lda,
150+
const int* ipiv,
151+
T* work,
152+
const int& lwork)
153+
{
154+
int info = 0;
155+
lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info);
156+
if (info != 0) {
157+
throw std::runtime_error("getri failed with info = " + std::to_string(info));
158+
}
159+
}
160+
};
161+
162+
template <typename T>
163+
struct lapack_getrs<T, DEVICE_CPU> {
164+
void operator()(
165+
const char& trans,
166+
const int& n,
167+
const int& nrhs,
168+
T* A,
169+
const int& lda,
170+
const int* ipiv,
171+
T* B,
172+
const int& ldb)
173+
{
174+
int info = 0;
175+
lapackConnector::getrs(trans, n, nrhs, A, lda, ipiv, B, ldb, info);
176+
if (info != 0) {
177+
throw std::runtime_error("getrs failed with info = " + std::to_string(info));
178+
}
179+
}
180+
};
181+
127182
template struct set_matrix<float, DEVICE_CPU>;
128183
template struct set_matrix<double, DEVICE_CPU>;
129184
template struct set_matrix<std::complex<float>, DEVICE_CPU>;
@@ -149,5 +204,20 @@ template struct lapack_dngvd<double, DEVICE_CPU>;
149204
template struct lapack_dngvd<std::complex<float>, DEVICE_CPU>;
150205
template struct lapack_dngvd<std::complex<double>, DEVICE_CPU>;
151206

207+
template struct lapack_getrf<float, DEVICE_CPU>;
208+
template struct lapack_getrf<double, DEVICE_CPU>;
209+
template struct lapack_getrf<std::complex<float>, DEVICE_CPU>;
210+
template struct lapack_getrf<std::complex<double>, DEVICE_CPU>;
211+
212+
template struct lapack_getri<float, DEVICE_CPU>;
213+
template struct lapack_getri<double, DEVICE_CPU>;
214+
template struct lapack_getri<std::complex<float>, DEVICE_CPU>;
215+
template struct lapack_getri<std::complex<double>, DEVICE_CPU>;
216+
217+
template struct lapack_getrs<float, DEVICE_CPU>;
218+
template struct lapack_getrs<double, DEVICE_CPU>;
219+
template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
220+
template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;
221+
152222
} // namespace kernels
153223
} // namespace container

source/module_base/module_container/ATen/kernels/lapack.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,42 @@ struct lapack_dngvd {
6565
Real* eigen_val);
6666
};
6767

68+
69+
template <typename T, typename Device>
70+
struct lapack_getrf {
71+
void operator()(
72+
const int& m,
73+
const int& n,
74+
T* Mat,
75+
const int& lda,
76+
int* ipiv);
77+
};
78+
79+
80+
template <typename T, typename Device>
81+
struct lapack_getri {
82+
void operator()(
83+
const int& n,
84+
T* Mat,
85+
const int& lda,
86+
const int* ipiv,
87+
T* work,
88+
const int& lwork);
89+
};
90+
91+
template <typename T, typename Device>
92+
struct lapack_getrs {
93+
void operator()(
94+
const char& trans,
95+
const int& n,
96+
const int& nrhs,
97+
T* A,
98+
const int& lda,
99+
const int* ipiv,
100+
T* B,
101+
const int& ldb);
102+
};
103+
68104
#if defined(__CUDA) || defined(__ROCM)
69105
// TODO: Use C++ singleton to manage the GPU handles
70106
void createGpuSolverHandle(); // create cusolver handle

0 commit comments

Comments (0)