Skip to content

Commit d8045b8

Browse files
authored
Merge branch 'develop' into fix/bpcg-gemm-insteadof-einsum
2 parents 570131e + 74b2954 commit d8045b8

File tree

189 files changed

+2956
-4653
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

189 files changed

+2956
-4653
lines changed

docs/advanced/input_files/input-main.md

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
- [kpar](#kpar)
1212
- [bndpar](#bndpar)
1313
- [latname](#latname)
14-
- [psi\_initializer](#psi_initializer)
1514
- [init\_wfc](#init_wfc)
1615
- [init\_chg](#init_chg)
1716
- [init\_vel](#init_vel)
@@ -93,6 +92,7 @@
9392
- [scf\_os\_stop](#scf_os_stop)
9493
- [scf\_os\_thr](#scf_os_thr)
9594
- [scf\_os\_ndim](#scf_os_ndim)
95+
- [sc\_os\_ndim](#sc_os_ndim)
9696
- [chg\_extrap](#chg_extrap)
9797
- [lspinorb](#lspinorb)
9898
- [noncolin](#noncolin)
@@ -467,7 +467,7 @@
467467
- [abs\_broadening](#abs_broadening)
468468
- [ri\_hartree\_benchmark](#ri_hartree_benchmark)
469469
- [aims\_nbasis](#aims_nbasis)
470-
- [Reduced Density Matrix Functional Theory](#Reduced-Density-Matrix-Functional-Theory)
470+
- [Reduced Density Matrix Functional Theory](#reduced-density-matrix-functional-theory)
471471
- [rdmft](#rdmft)
472472
- [rdmft\_power\_alpha](#rdmft_power_alpha)
473473

@@ -580,17 +580,6 @@ These variables are used to control general system parameters.
580580
- triclinic: triclinic (14)
581581
- **Default**: none
582582

583-
### psi_initializer
584-
585-
- **Type**: Integer
586-
- **Description**: enable the experimental feature psi_initializer, to support use numerical atomic orbitals initialize wavefunction (`basis_type pw` case).
587-
588-
NOTE: this feature is not well-implemented for `nspin 4` case (closed presently), and cannot use with `calculation nscf`/`esolver_type sdft` cases.
589-
Available options are:
590-
- 0: disable psi_initializer
591-
- 1: enable psi_initializer
592-
- **Default**: 0
593-
594583
### init_wfc
595584

596585
- **Type**: String
@@ -602,8 +591,6 @@ These variables are used to control general system parameters.
602591
- atomic+random: add small random numbers on atomic pseudo-wavefunctions
603592
- file: from binary files `WAVEFUNC*.dat`, which are output by setting [out_wfc_pw](#out_wfc_pw) to `2`.
604593
- random: random numbers
605-
606-
with `psi_initializer 1`, two more options are supported:
607594
- nao: from numerical atomic orbitals. If they are not enough, other wave functions are initialized with random numbers.
608595
- nao+random: add small random numbers on numerical atomic orbitals
609596
- **Default**: atomic

source/Makefile.Objects

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ VPATH=./src_global:\
6969
./module_ri:\
7070
./module_parameter:\
7171
./module_lr:\
72-
./module_lr/AX:\
72+
./module_lr/ao_to_mo_transformer:\
7373
./module_lr/dm_trans:\
7474
./module_lr/operator_casida:\
7575
./module_lr/potentials:\
@@ -189,6 +189,8 @@ OBJS_CELL=atom_pseudo.o\
189189
check_atomic_stru.o\
190190
update_cell.o\
191191
bcast_cell.o\
192+
read_stru.o\
193+
read_atom_species.o\
192194

193195
OBJS_DEEPKS=LCAO_deepks.o\
194196
deepks_force.o\
@@ -398,6 +400,7 @@ OBJS_PSI=psi.o\
398400

399401
OBJS_PSI_INITIALIZER=psi_initializer.o\
400402
psi_initializer_random.o\
403+
psi_initializer_file.o\
401404
psi_initializer_atomic.o\
402405
psi_initializer_atomic_random.o\
403406
psi_initializer_nao.o\
@@ -494,6 +497,7 @@ OBJS_IO=input_conv.o\
494497
to_wannier90_lcao.o\
495498
fR_overlap.o\
496499
unk_overlap_pw.o\
500+
write_pao.o\
497501
write_wfc_pw.o\
498502
winput.o\
499503
write_cube.o\
@@ -669,8 +673,6 @@ OBJS_SRCPW=H_Ewald_pw.o\
669673
of_stress_pw.o\
670674
symmetry_rho.o\
671675
symmetry_rhog.o\
672-
wavefunc.o\
673-
wf_atomic.o\
674676
psi_init.o\
675677
elecond.o\
676678
sto_tool.o\
@@ -723,8 +725,8 @@ OBJS_TENSOR=tensor.o\
723725

724726
OBJS_LR=lr_util.o\
725727
lr_util_hcontainer.o\
726-
AX_parallel.o\
727-
AX_serial.o\
728+
ao_to_mo_parallel.o\
729+
ao_to_mo_serial.o\
728730
dm_trans_parallel.o\
729731
dm_trans_serial.o\
730732
dmr_complex.o\

source/module_base/blas_connector.cpp

Lines changed: 125 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,36 @@ namespace BlasUtils{
5050
return CUBLAS_OP_N;
5151
}
5252

53+
cublasSideMode_t judge_side(const char& trans)
54+
{
55+
if (trans == 'L')
56+
{
57+
return CUBLAS_SIDE_LEFT;
58+
}
59+
else if (trans == 'R')
60+
{
61+
return CUBLAS_SIDE_RIGHT;
62+
}
63+
return CUBLAS_SIDE_LEFT;
64+
}
65+
66+
cublasFillMode_t judge_fill(const char& trans)
67+
{
68+
if (trans == 'F')
69+
{
70+
return CUBLAS_FILL_MODE_FULL;
71+
}
72+
else if (trans == 'U')
73+
{
74+
return CUBLAS_FILL_MODE_UPPER;
75+
}
76+
else if (trans == 'D')
77+
{
78+
return CUBLAS_FILL_MODE_LOWER;
79+
}
80+
return CUBLAS_FILL_MODE_FULL;
81+
}
82+
5383
} // namespace BlasUtils
5484

5585
#endif
@@ -398,6 +428,13 @@ void BlasConnector::symm_cm(const char side, const char uplo, const int m, const
398428
&alpha, a, &lda, b, &ldb,
399429
&beta, c, &ldc);
400430
}
431+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
432+
#ifdef __CUDA
433+
cublasSideMode_t sideMode = BlasUtils::judge_side(side);
434+
cublasFillMode_t fillMode = BlasUtils::judge_fill(uplo);
435+
cublasErrcheck(cublasSsymm(BlasUtils::cublas_handle, sideMode, fillMode, m, n, &alpha, a, lda, b, ldb, &beta, c, ldc));
436+
#endif
437+
}
401438
}
402439

403440
void BlasConnector::symm_cm(const char side, const char uplo, const int m, const int n,
@@ -409,6 +446,13 @@ void BlasConnector::symm_cm(const char side, const char uplo, const int m, const
409446
&alpha, a, &lda, b, &ldb,
410447
&beta, c, &ldc);
411448
}
449+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
450+
#ifdef __CUDA
451+
cublasSideMode_t sideMode = BlasUtils::judge_side(side);
452+
cublasFillMode_t fillMode = BlasUtils::judge_fill(uplo);
453+
cublasErrcheck(cublasDsymm(BlasUtils::cublas_handle, sideMode, fillMode, m, n, &alpha, a, lda, b, ldb, &beta, c, ldc));
454+
#endif
455+
}
412456
}
413457

414458
void BlasConnector::symm_cm(const char side, const char uplo, const int m, const int n,
@@ -420,6 +464,13 @@ void BlasConnector::symm_cm(const char side, const char uplo, const int m, const
420464
&alpha, a, &lda, b, &ldb,
421465
&beta, c, &ldc);
422466
}
467+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
468+
#ifdef __CUDA
469+
cublasSideMode_t sideMode = BlasUtils::judge_side(side);
470+
cublasFillMode_t fillMode = BlasUtils::judge_fill(uplo);
471+
cublasErrcheck(cublasCsymm(BlasUtils::cublas_handle, sideMode, fillMode, m, n, (float2*)&alpha, (float2*)a, lda, (float2*)b, ldb, (float2*)&beta, (float2*)c, ldc));
472+
#endif
473+
}
423474
}
424475

425476
void BlasConnector::symm_cm(const char side, const char uplo, const int m, const int n,
@@ -431,6 +482,13 @@ void BlasConnector::symm_cm(const char side, const char uplo, const int m, const
431482
&alpha, a, &lda, b, &ldb,
432483
&beta, c, &ldc);
433484
}
485+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
486+
#ifdef __CUDA
487+
cublasSideMode_t sideMode = BlasUtils::judge_side(side);
488+
cublasFillMode_t fillMode = BlasUtils::judge_fill(uplo);
489+
cublasErrcheck(cublasZsymm(BlasUtils::cublas_handle, sideMode, fillMode, m, n, (double2*)&alpha, (double2*)a, lda, (double2*)b, ldb, (double2*)&beta, (double2*)c, ldc));
490+
#endif
491+
}
434492
}
435493

436494
void BlasConnector::hemm_cm(char side, char uplo, int m, int n,
@@ -442,6 +500,13 @@ void BlasConnector::hemm_cm(char side, char uplo, int m, int n,
442500
&alpha, a, &lda, b, &ldb,
443501
&beta, c, &ldc);
444502
}
503+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
504+
#ifdef __CUDA
505+
cublasSideMode_t sideMode = BlasUtils::judge_side(side);
506+
cublasFillMode_t fillMode = BlasUtils::judge_fill(uplo);
507+
cublasErrcheck(cublasChemm(BlasUtils::cublas_handle, sideMode, fillMode, m, n, (float2*)&alpha, (float2*)a, lda, (float2*)b, ldb, (float2*)&beta, (float2*)c, ldc));
508+
#endif
509+
}
445510
}
446511

447512
void BlasConnector::hemm_cm(char side, char uplo, int m, int n,
@@ -453,6 +518,13 @@ void BlasConnector::hemm_cm(char side, char uplo, int m, int n,
453518
&alpha, a, &lda, b, &ldb,
454519
&beta, c, &ldc);
455520
}
521+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
522+
#ifdef __CUDA
523+
cublasSideMode_t sideMode = BlasUtils::judge_side(side);
524+
cublasFillMode_t fillMode = BlasUtils::judge_fill(uplo);
525+
cublasErrcheck(cublasZhemm(BlasUtils::cublas_handle, sideMode, fillMode, m, n, (double2*)&alpha, (double2*)a, lda, (double2*)b, ldb, (double2*)&beta, (double2*)c, ldc));
526+
#endif
527+
}
456528
}
457529

458530
void BlasConnector::gemv(const char trans, const int m, const int n,
@@ -461,7 +533,13 @@ void BlasConnector::gemv(const char trans, const int m, const int n,
461533
{
462534
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
463535
sgemv_(&trans, &m, &n, &alpha, A, &lda, X, &incx, &beta, Y, &incy);
464-
}
536+
}
537+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
538+
#ifdef __CUDA
539+
cublasOperation_t cutransA = BlasUtils::judge_trans(false, trans, "gemv_op");
540+
cublasErrcheck(cublasSgemv(BlasUtils::cublas_handle, cutransA, m, n, &alpha, A, lda, X, incx, &beta, Y, incy));
541+
#endif
542+
}
465543
}
466544

467545
void BlasConnector::gemv(const char trans, const int m, const int n,
@@ -470,7 +548,13 @@ void BlasConnector::gemv(const char trans, const int m, const int n,
470548
{
471549
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
472550
dgemv_(&trans, &m, &n, &alpha, A, &lda, X, &incx, &beta, Y, &incy);
473-
}
551+
}
552+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
553+
#ifdef __CUDA
554+
cublasOperation_t cutransA = BlasUtils::judge_trans(false, trans, "gemv_op");
555+
cublasErrcheck(cublasDgemv(BlasUtils::cublas_handle, cutransA, m, n, &alpha, A, lda, X, incx, &beta, Y, incy));
556+
#endif
557+
}
474558
}
475559

476560
void BlasConnector::gemv(const char trans, const int m, const int n,
@@ -479,7 +563,15 @@ void BlasConnector::gemv(const char trans, const int m, const int n,
479563
{
480564
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
481565
cgemv_(&trans, &m, &n, &alpha, A, &lda, X, &incx, &beta, Y, &incy);
482-
}
566+
}
567+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
568+
#ifdef __CUDA
569+
cuFloatComplex alpha_cu = make_cuFloatComplex(alpha.real(), alpha.imag());
570+
cuFloatComplex beta_cu = make_cuFloatComplex(beta.real(), beta.imag());
571+
cublasOperation_t cutransA = BlasUtils::judge_trans(true, trans, "gemv_op");
572+
cublasErrcheck(cublasCgemv(BlasUtils::cublas_handle, cutransA, m, n, &alpha_cu, (cuFloatComplex*)A, lda, (cuFloatComplex*)X, incx, &beta_cu, (cuFloatComplex*)Y, incy));
573+
#endif
574+
}
483575
}
484576

485577
void BlasConnector::gemv(const char trans, const int m, const int n,
@@ -488,7 +580,15 @@ void BlasConnector::gemv(const char trans, const int m, const int n,
488580
{
489581
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
490582
zgemv_(&trans, &m, &n, &alpha, A, &lda, X, &incx, &beta, Y, &incy);
491-
}
583+
}
584+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
585+
#ifdef __CUDA
586+
cuDoubleComplex alpha_cu = make_cuDoubleComplex(alpha.real(), alpha.imag());
587+
cuDoubleComplex beta_cu = make_cuDoubleComplex(beta.real(), beta.imag());
588+
cublasOperation_t cutransA = BlasUtils::judge_trans(true, trans, "gemv_op");
589+
cublasErrcheck(cublasZgemv(BlasUtils::cublas_handle, cutransA, m, n, &alpha_cu, (cuDoubleComplex*)A, lda, (cuDoubleComplex*)X, incx, &beta_cu, (cuDoubleComplex*)Y, incy));
590+
#endif
591+
}
492592
}
493593

494594
// out = ||x||_2
@@ -497,6 +597,13 @@ float BlasConnector::nrm2( const int n, const float *X, const int incX, base_dev
497597
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
498598
return snrm2_( &n, X, &incX );
499599
}
600+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
601+
#ifdef __CUDA
602+
float result = 0.0;
603+
cublasErrcheck(cublasSnrm2(BlasUtils::cublas_handle, n, X, incX, &result));
604+
return result;
605+
#endif
606+
}
500607
return snrm2_( &n, X, &incX );
501608
}
502609

@@ -506,6 +613,13 @@ double BlasConnector::nrm2( const int n, const double *X, const int incX, base_d
506613
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
507614
return dnrm2_( &n, X, &incX );
508615
}
616+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
617+
#ifdef __CUDA
618+
double result = 0.0;
619+
cublasErrcheck(cublasDnrm2(BlasUtils::cublas_handle, n, X, incX, &result));
620+
return result;
621+
#endif
622+
}
509623
return dnrm2_( &n, X, &incX );
510624
}
511625

@@ -515,6 +629,13 @@ double BlasConnector::nrm2( const int n, const std::complex<double> *X, const in
515629
if (device_type == base_device::AbacusDevice_t::CpuDevice) {
516630
return dznrm2_( &n, X, &incX );
517631
}
632+
else if (device_type == base_device::AbacusDevice_t::GpuDevice){
633+
#ifdef __CUDA
634+
double result = 0.0;
635+
cublasErrcheck(cublasDznrm2(BlasUtils::cublas_handle, n, (double2*)X, incX, &result));
636+
return result;
637+
#endif
638+
}
518639
return dznrm2_( &n, X, &incX );
519640
}
520641

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
AddTest(
22
TARGET container_ops_uts
33
LIBS parameter ${math_libs}
4-
SOURCES einsum_op_test.cpp linalg_op_test.cpp
4+
SOURCES einsum_op_test.cpp linalg_op_test.cpp ../../kernels/lapack.cpp
55
)
66

77
target_link_libraries(container_ops_uts container base device)

source/module_basis/module_pw/pw_basis_k.cpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ PW_Basis_K::~PW_Basis_K()
2222
delete[] igl2isz_k;
2323
delete[] igl2ig_k;
2424
delete[] gk2;
25-
delete[] ig2ixyz_k_;
2625
#if defined(__CUDA) || defined(__ROCM)
2726
if (this->device == "gpu") {
2827
if (this->precision == "single") {
@@ -169,6 +168,7 @@ void PW_Basis_K::setupIndGk()
169168
syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks);
170169
}
171170
#endif
171+
this->get_ig2ixyz_k();
172172
return;
173173
}
174174

@@ -334,8 +334,12 @@ int& PW_Basis_K::getigl2ig(const int ik, const int igl) const
334334

335335
void PW_Basis_K::get_ig2ixyz_k()
336336
{
337-
delete[] this->ig2ixyz_k_;
338-
this->ig2ixyz_k_ = new int [this->npwk_max * this->nks];
337+
if (this->device != "gpu")
338+
{
339+
//only GPU need to get ig2ixyz_k
340+
return;
341+
}
342+
int * ig2ixyz_k_cpu = new int [this->npwk_max * this->nks];
339343
ModuleBase::Memory::record("PW_B_K::ig2ixyz", sizeof(int) * this->npwk_max * this->nks);
340344
assert(gamma_only == false); //We only finish non-gamma_only fft on GPU temperarily.
341345
for(int ik = 0; ik < this->nks; ++ik)
@@ -348,15 +352,12 @@ void PW_Basis_K::get_ig2ixyz_k()
348352
int ixy = this->is2fftixy[is];
349353
int iy = ixy % this->ny;
350354
int ix = ixy / this->ny;
351-
ig2ixyz_k_[igl + ik * npwk_max] = iz + iy * nz + ix * ny * nz;
355+
ig2ixyz_k_cpu[igl + ik * npwk_max] = iz + iy * nz + ix * ny * nz;
352356
}
353357
}
354-
#if defined(__CUDA) || defined(__ROCM)
355-
if (this->device == "gpu") {
356-
resmem_int_op()(gpu_ctx, ig2ixyz_k, this->npwk_max * this->nks);
357-
syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->ig2ixyz_k, this->ig2ixyz_k_, this->npwk_max * this->nks);
358-
}
359-
#endif
358+
resmem_int_op()(gpu_ctx, ig2ixyz_k, this->npwk_max * this->nks);
359+
syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->ig2ixyz_k, ig2ixyz_k_cpu, this->npwk_max * this->nks);
360+
delete[] ig2ixyz_k_cpu;
360361
}
361362

362363
std::vector<int> PW_Basis_K::get_ig2ix(const int ik) const

0 commit comments

Comments
 (0)