Skip to content

Commit a6cc224

Browse files
committed
Feature: implement k continuity initialization strategy & kernel
optimization in davidson-subspcae algorithm - add k continuity initialization strategy in planewave basis - implement heterogenous computation branching between CPU and DCU - implement optimized eigenvalue operations for GPU & DCU - implement optimized preconditioner for GPU & DCU - implement optimized normalization op for GPU & DCU
1 parent 8fc7c42 commit a6cc224

File tree

16 files changed

+3958
-144
lines changed

16 files changed

+3958
-144
lines changed

source/module_base/kernels/math_kernel_op.h

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,48 @@ template <typename T, typename Device> struct matrixCopy {
284284
void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
285285
};
286286

287+
template <typename T, typename Device>
288+
struct apply_eigenvalues_op {
289+
using Real = typename GetTypeReal<T>::type;
290+
291+
void operator()(const Device *d, const int &nbase, const int &nbase_x, const int &notconv,
292+
T *result, const T *vectors, const Real *eigenvalues);
293+
};
294+
295+
template <typename T, typename Device>
296+
struct precondition_op {
297+
using Real = typename GetTypeReal<T>::type;
298+
void operator()(const Device* d,
299+
const int& dim,
300+
T* psi_iter,
301+
const int& nbase,
302+
const int& notconv,
303+
const Real* precondition,
304+
const Real* eigenvalues);
305+
};
306+
307+
template <typename T, typename Device>
308+
struct normalize_op {
309+
using Real = typename GetTypeReal<T>::type;
310+
void operator()(const Device* d,
311+
const int& dim,
312+
T* psi_iter,
313+
const int& nbase,
314+
const int& notconv,
315+
Real* psi_norm = nullptr);
316+
};
317+
318+
template <typename T>
319+
struct normalize_op<T, base_device::DEVICE_GPU> {
320+
using Real = typename GetTypeReal<T>::type;
321+
void operator()(const base_device::DEVICE_GPU* d,
322+
const int& dim,
323+
T* psi_iter,
324+
const int& nbase,
325+
const int& notconv,
326+
Real* psi_norm);
327+
};
328+
287329
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
288330
// Partially specialize functor for base_device::GpuDevice.
289331
template <typename T> struct dot_real_op<T, base_device::DEVICE_GPU> {
@@ -334,6 +376,26 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
334376
void createGpuBlasHandle();
335377
void destoryBLAShandle();
336378

379+
// vector operator: result[i] = -lambda[i] * vector[i]
380+
template <typename T> struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
381+
using Real = typename GetTypeReal<T>::type;
382+
383+
void operator()(const base_device::DEVICE_GPU *d, const int &nbase, const int &nbase_x, const int &notconv,
384+
T *result, const T *vectors, const Real *eigenvalues);
385+
};
386+
387+
template <typename T>
388+
struct precondition_op<T, base_device::DEVICE_GPU> {
389+
using Real = typename GetTypeReal<T>::type;
390+
void operator()(const base_device::DEVICE_GPU* d,
391+
const int& dim,
392+
T* psi_iter,
393+
const int& nbase,
394+
const int& notconv,
395+
const Real* precondition,
396+
const Real* eigenvalues);
397+
};
398+
337399
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
338400
} // namespace hsolver
339401

source/module_base/module_device/device.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#include <base/macros/macros.h>
77
#include <cstring>
8-
8+
#include <iostream>
99
#ifdef __MPI
1010
#include "mpi.h"
1111
#endif
@@ -166,6 +166,11 @@ int device_count = -1;
166166
cudaGetDeviceCount(&device_count);
167167
#elif defined(__ROCM)
168168
hipGetDeviceCount(&device_count);
169+
/***auto start_time = std::chrono::high_resolution_clock::now();
170+
std::cout << "Starting hipGetDeviceCount.." << std::endl;
171+
auto end_time = std::chrono::high_resolution_clock::now();
172+
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
173+
std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/
169174
#endif
170175
if (device_count <= 0)
171176
{
@@ -711,4 +716,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
711716
#endif
712717

713718
} // end of namespace information
714-
} // end of namespace base_device
719+
} // end of namespace base_device

source/module_esolver/esolver.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,10 @@ std::string determine_type()
114114
}
115115

116116
GlobalV::ofs_running << "\n RUNNING WITH DEVICE : " << device_info << " / "
117-
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
118-
117+
<< base_device::information::get_device_info(PARAM.inp.device) << std::endl;
118+
/***auto end_time = std::chrono::high_resolution_clock::now();
119+
auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
120+
std::cout << "hipGetDeviceInfo took " << duration.count() << " seconds" << std::endl;***/
119121
return esolver_type;
120122
}
121123

source/module_esolver/esolver_ks_pw.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,8 @@ void ESolver_KS_PW<T, Device>::hamilt2rho_single(UnitCell& ucell,
572572
hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
573573
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
574574
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
575-
hsolver::DiagoIterAssist<T, Device>::need_subspace);
575+
hsolver::DiagoIterAssist<T, Device>::need_subspace,
576+
PARAM.inp.use_k_continuity);
576577

577578
hsolver_pw_obj.solve(this->p_hamilt,
578579
this->kspw_psi[0],

source/module_hsolver/diago_dav_subspace.cpp

Lines changed: 67 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -293,26 +293,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
293293
psi_iter + (nbase) * this->dim,
294294
this->dim);
295295

296-
std::vector<Real> e_temp_cpu(nbase, 0);
296+
// Eigenvalues operation section
297+
std::vector<Real> e_temp_cpu(this->notconv, 0);
297298
Real* e_temp_hd = e_temp_cpu.data();
298-
if(this->device == base_device::GpuDevice)
299+
if (this->device == base_device::GpuDevice)
299300
{
300301
e_temp_hd = nullptr;
301-
resmem_real_op()(e_temp_hd, nbase);
302+
resmem_real_op()(this->ctx, e_temp_hd, this->notconv);
302303
}
303-
for (int m = 0; m < notconv; m++)
304+
305+
for (int m = 0; m < this->notconv; m++)
304306
{
305-
e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
306-
if (this->device == base_device::GpuDevice)
307-
{
308-
syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), nbase);
309-
}
310-
ModuleBase::vector_mul_vector_op<T, Device>()(nbase,
311-
vcc + m * this->nbase_x,
312-
vcc + m * this->nbase_x,
313-
e_temp_hd);
307+
e_temp_cpu[m] = -(*eigenvalue_iter)[m];
314308
}
315-
if(this->device == base_device::GpuDevice)
309+
310+
if (this->device == base_device::GpuDevice)
311+
{
312+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
313+
}
314+
315+
apply_eigenvalues_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
316+
317+
if (this->device == base_device::GpuDevice)
316318
{
317319
delmem_real_op()(e_temp_hd);
318320
}
@@ -336,48 +338,62 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
336338
psi_iter + nbase * this->dim,
337339
this->dim);
338340

339-
// "precondition!!!"
340-
std::vector<Real> pre(this->dim, 0.0);
341-
for (int m = 0; m < notconv; m++)
342-
{
343-
for (size_t i = 0; i < this->dim; i++)
344-
{
345-
// pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
346-
double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
347-
pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
348-
}
341+
// Precondition section
349342
#if defined(__CUDA) || defined(__ROCM)
350-
if (this->device == base_device::GpuDevice)
351-
{
352-
syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim);
353-
ModuleBase::vector_div_vector_op<T, Device>()(this->dim,
354-
psi_iter + (nbase + m) * this->dim,
355-
psi_iter + (nbase + m) * this->dim,
356-
this->d_precondition);
357-
}
358-
else
343+
if (this->device == base_device::GpuDevice)
344+
{
345+
Real* eigenvalues_gpu = nullptr;
346+
resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
347+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu,(*eigenvalue_iter).data(), notconv);
348+
349+
precondition_op<T, Device>()(this->ctx,
350+
this->dim,
351+
psi_iter,
352+
nbase,
353+
notconv,
354+
d_precondition,
355+
eigenvalues_gpu);
356+
delmem_real_op()(this->ctx, eigenvalues_gpu);
357+
}
358+
else
359359
#endif
360-
{
361-
ModuleBase::vector_div_vector_op<T, Device>()(this->dim,
362-
psi_iter + (nbase + m) * this->dim,
363-
psi_iter + (nbase + m) * this->dim,
364-
pre.data());
365-
}
360+
{
361+
precondition_op<T, Device>()(this->ctx,
362+
this->dim,
363+
psi_iter,
364+
nbase,
365+
notconv,
366+
this->precondition.data(),
367+
(*eigenvalue_iter).data());
366368
}
367369

368-
// "normalize!!!" in order to improve numerical stability of subspace diagonalization
369-
for (size_t i = 0; i < notconv; i++)
370+
// Normalize section
371+
#if defined(__CUDA) || defined(__ROCM)
372+
if (this->device == base_device::GpuDevice)
373+
{
374+
Real* psi_norm = nullptr;
375+
resmem_real_op()(this->ctx, psi_norm, notconv);
376+
using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
377+
setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
378+
379+
normalize_op<T, Device>()(this->ctx,
380+
this->dim,
381+
psi_iter,
382+
nbase,
383+
notconv,
384+
psi_norm);
385+
delmem_real_op()(this->ctx, psi_norm);
386+
}
387+
else
388+
#endif
370389
{
371-
Real psi_norm = ModuleBase::dot_real_op<T, Device>()(this->dim,
372-
psi_iter + (nbase + i) * this->dim,
373-
psi_iter + (nbase + i) * this->dim,
374-
true);
375-
assert(psi_norm > 0.0);
376-
psi_norm = sqrt(psi_norm);
377-
ModuleBase::vector_mul_real_op<T, Device>()(this->dim,
378-
psi_iter + (nbase + i) * this->dim,
379-
psi_iter + (nbase + i) * this->dim,
380-
Real(1.0 / psi_norm));
390+
Real* psi_norm = nullptr;
391+
normalize_op<T, Device>()(this->ctx,
392+
this->dim,
393+
psi_iter,
394+
nbase,
395+
notconv,
396+
psi_norm);
381397
}
382398

383399
// update hpsi[:, nbase:nbase+notconv]

0 commit comments

Comments
 (0)