Skip to content

Commit b9bb679

Browse files
committed
Perf: reduce memory allocation and copy in Diago_DavSubspace::diag_zhegvx
Signed-off-by: Tianxiang Wang <[email protected]>. Contributed under MetaX Integrated Circuits (Shanghai) Co., Ltd.
1 parent 1bd9581 commit b9bb679

File tree

4 files changed

+13
-35
lines changed

4 files changed

+13
-35
lines changed

source/source_hsolver/diago_dav_subspace.cpp

Lines changed: 5 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
7676
{
7777
resmem_real_op()(this->d_precondition, nbasis_in);
7878
// syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
79+
base_device::memory::resize_memory_op<T, Device>()(this->d_scc, this->nbase_x * this->nbase_x);
7980
resmem_real_op()(this->d_eigenvalue, this->nbase_x);
8081
}
8182
#endif
@@ -95,6 +96,7 @@ Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
9596
if (this->device == base_device::GpuDevice)
9697
{
9798
delmem_real_op()(this->d_precondition);
99+
delmem_complex_op()(this->d_scc);
98100
delmem_real_op()(this->d_eigenvalue);
99101
}
100102
#endif
@@ -546,34 +548,9 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
546548
#if defined(__CUDA) || defined(__ROCM)
547549
if (this->diag_comm.rank == 0)
548550
{
549-
Real* eigenvalue_gpu = nullptr;
550-
resmem_real_op()(eigenvalue_gpu, this->nbase_x);
551-
552-
syncmem_var_h2d_op()(eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x);
553-
554-
T* hcc_gpu = nullptr;
555-
T* scc_gpu = nullptr;
556-
T* vcc_gpu = nullptr;
557-
base_device::memory::resize_memory_op<T, Device>()(hcc_gpu, nbase * nbase);
558-
base_device::memory::resize_memory_op<T, Device>()(scc_gpu, nbase * nbase);
559-
base_device::memory::resize_memory_op<T, Device>()(vcc_gpu, nbase * nbase);
560-
for(int i=0;i<nbase;i++)
561-
{
562-
base_device::memory::synchronize_memory_op<T, Device, Device>()(hcc_gpu + i * nbase, hcc + i * nbase_x, nbase);
563-
base_device::memory::synchronize_memory_op<T, Device, Device>()(scc_gpu + i * nbase, scc + i * nbase_x, nbase);
564-
}
565-
dngvd_op<T, Device>()(this->ctx, nbase, nbase, hcc_gpu, scc_gpu, eigenvalue_gpu, vcc_gpu);
566-
for(int i=0;i<nbase;i++)
567-
{
568-
base_device::memory::synchronize_memory_op<T, Device, Device>()(vcc + i * nbase_x, vcc_gpu + i * nbase, nbase);
569-
}
570-
delmem_complex_op()(hcc_gpu);
571-
delmem_complex_op()(scc_gpu);
572-
delmem_complex_op()(vcc_gpu);
573-
574-
syncmem_var_d2h_op()((*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x);
575-
576-
delmem_real_op()(eigenvalue_gpu);
551+
base_device::memory::synchronize_memory_op<T, Device, Device>()(this->d_scc, scc, nbase * this->nbase_x);
552+
dngvd_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc);
553+
syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x);
577554
}
578555
#endif
579556
}

source/source_hsolver/diago_dav_subspace.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,7 @@ class Diago_DavSubspace
9494
/// Eigenvectors on the reduced basis
9595
T* vcc = nullptr;
9696

97+
T* d_scc = nullptr;
9798
Real* d_eigenvalue = nullptr;
9899

99100
/// device type of psi

source/source_hsolver/kernels/cuda/dngvd_op.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -216,7 +216,7 @@ struct dngvd_op<T, base_device::DEVICE_GPU>
216216
Real* W, // eigenvalue
217217
T* V)
218218
{
219-
assert(nstart == ldh);
219+
// assert(nstart == ldh);
220220
// A to V
221221
cudaErrcheck(cudaMemcpy(V, A, sizeof(T) * ldh * nstart, cudaMemcpyDeviceToDevice));
222222
xhegvd_wrapper(CUBLAS_FILL_MODE_UPPER, nstart, V, ldh,

source/source_hsolver/kernels/rocm/dngvd_op.hip.cu

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ namespace hsolver {
88
// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
99

1010
static hipsolverHandle_t hipsolver_H = nullptr;
11-
// Test on DCU platform. When nstart is greater than 234, code on DCU performs better.
11+
// Test on DCU platform. When nstart is greater than 234, code on DCU performs better.
1212
const int N_DCU = 234;
1313

1414
void createGpuSolverHandle() {
@@ -97,7 +97,7 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
9797
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
9898
}
9999

100-
100+
101101
}
102102
#endif // __LCAO
103103

@@ -112,7 +112,7 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
112112
{
113113
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
114114
assert(nstart == ldh);
115-
115+
116116
if (nstart > N_DCU){
117117
hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
118118
// now vcc contains hcc
@@ -170,7 +170,7 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
170170
hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
171171
}
172172

173-
173+
174174
}
175175

176176
template <>
@@ -184,7 +184,7 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
184184
)
185185
{
186186
// copied from ../cuda/dngvd_op.cu, "dngvd_op"
187-
assert(nstart == ldh);
187+
// assert(nstart == ldh);
188188

189189
// save a copy of scc in case the diagonalization fails
190190
if (nstart > N_DCU){
@@ -253,7 +253,7 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
253253

254254

255255

256-
256+
257257
}
258258

259259
#ifdef __LCAO

0 commit comments

Comments
 (0)