Skip to content

Commit bc1fdfc

Browse files
authored
Merge branch 'LTS' into xc_output_LTS
2 parents 5f761c7 + e59bbdd commit bc1fdfc

File tree

27 files changed

+2091
-929
lines changed

27 files changed

+2091
-929
lines changed

CMakeLists.txt

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,9 @@ set(ABACUS_BIN_PATH ${CMAKE_CURRENT_BINARY_DIR}/${ABACUS_BIN_NAME})
153153
include_directories(${ABACUS_SOURCE_DIR})
154154
include_directories(${ABACUS_SOURCE_DIR}/module_base/module_container)
155155

156-
set(CMAKE_CXX_STANDARD 11)
156+
if(NOT DEFINED CMAKE_CXX_STANDARD)
157+
set(CMAKE_CXX_STANDARD 11)
158+
endif()
157159
set(CMAKE_CXX_STANDARD_REQUIRED ON)
158160

159161
add_executable(${ABACUS_BIN_NAME} source/main.cpp)
@@ -293,22 +295,33 @@ endif()
293295

294296
if(USE_CUDA)
295297
cmake_minimum_required(VERSION 3.18) # required by `CUDA_ARCHITECTURES` below
298+
find_package(CUDAToolkit REQUIRED)
296299
set_if_higher(CMAKE_CXX_STANDARD 14)
300+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0")
301+
message(STATUS "CUDA ${CUDAToolkit_VERSION} detected. Raising CMAKE_CXX_STANDARD (and hence CMAKE_CUDA_STANDARD) to 17.")
302+
set_if_higher(CMAKE_CXX_STANDARD 17)
303+
endif()
297304
set(CMAKE_CXX_EXTENSIONS ON)
298305
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
299306
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
300307
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
301308
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
302-
find_package(CUDAToolkit REQUIRED)
303309
# check
304310
# https://gitlab.kitware.com/cmake/cmake/-/blob/master/Modules/Internal/CMakeCUDAArchitecturesAll.cmake
305-
# for available architechures in different CUDA versions
306-
set(CMAKE_CUDA_ARCHITECTURES
307-
60 # P100
308-
70 # V100
309-
# Add your CUDA arch here Check the Compute Capability version of your
310-
# GPU at: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
311-
)
311+
# for available architectures in different CUDA versions
312+
# CUDA 13.0+ dropped support for architectures below 75
313+
if(CUDAToolkit_VERSION VERSION_LESS "13.0")
314+
set(CMAKE_CUDA_ARCHITECTURES
315+
60 # P100
316+
70 # V100
317+
)
318+
else()
319+
# Start with empty list; architectures 75+ will be added below
320+
set(CMAKE_CUDA_ARCHITECTURES)
321+
endif()
322+
323+
# Add your CUDA arch here. Check the Compute Capability version of your
324+
# GPU at: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
312325
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0)
313326
list(APPEND CMAKE_CUDA_ARCHITECTURES 75) # T4
314327
endif()

docs/advanced/input_files/input-main.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- [pw\_seed](#pw_seed)
3838
- [pw\_diag\_thr](#pw_diag_thr)
3939
- [diago\_smooth\_ethr](#diago_smooth_ethr)
40+
- [use\_k\_continuity](#use_k_continuity)
4041
- [pw\_diag\_nmax](#pw_diag_nmax)
4142
- [pw\_diag\_ndim](#pw_diag_ndim)
4243
- [erf\_ecut](#erf_ecut)
@@ -774,6 +775,18 @@ These variables are used to control the plane wave related parameters.
774775
- **Description**: If `TRUE`, the smooth threshold strategy, which applies a larger threshold (10e-5) for the empty states, will be implemented in the diagonalization methods. (This strategy should not affect total energy, forces, and other ground-state properties, but computational efficiency will be improved.) If `FALSE`, the smooth threshold strategy will not be applied.
775776
- **Default**: false
776777

778+
### use_k_continuity
779+
780+
- **Type**: Boolean
781+
- **Availability**: Used only for plane wave basis set.
782+
- **Description**: Whether to use k-point continuity for initializing wave functions. When enabled, this strategy exploits the similarity between wavefunctions at neighboring k-points by propagating the wavefunction from a previously initialized k-point to a new k-point, significantly reducing the computational cost of the initial guess.
783+
784+
**Important constraints:**
785+
- Must be used together with `diago_smooth_ethr = 1` for optimal performance
786+
787+
This feature is particularly useful for calculations with dense k-point sampling where the computational cost of wavefunction initialization becomes significant.
788+
- **Default**: false
789+
777790
### pw_diag_nmax
778791

779792
- **Type**: Integer

source/module_base/module_device/device.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#if defined(__CUDA)
1414
#include <cuda_runtime.h>
15+
#include <cuda.h>
1516
#endif
1617

1718
#if defined(__ROCM)
@@ -299,6 +300,7 @@ void print_device_info<base_device::DEVICE_GPU>(
299300
sprintf(msg, " CUDA Capability Major/Minor version number: %d.%d\n",
300301
deviceProp.major, deviceProp.minor);
301302
ofs_device << msg << std::endl;
303+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
302304
sprintf(msg,
303305
" GPU Max Clock rate: %.0f MHz (%0.2f "
304306
"GHz)\n",
@@ -312,6 +314,7 @@ void print_device_info<base_device::DEVICE_GPU>(
312314
sprintf(msg, " Memory Bus Width: %d-bit\n",
313315
deviceProp.memoryBusWidth);
314316
ofs_device << msg << std::endl;
317+
#endif
315318
sprintf(msg,
316319
" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
317320
"%d), 3D=(%d, %d, %d)\n",
@@ -366,6 +369,7 @@ void print_device_info<base_device::DEVICE_GPU>(
366369
sprintf(msg, " Texture alignment: %zu bytes\n",
367370
deviceProp.textureAlignment);
368371
ofs_device << msg << std::endl;
372+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
369373
sprintf(msg,
370374
" Concurrent copy and kernel execution: %s with %d copy "
371375
"engine(s)\n",
@@ -375,6 +379,7 @@ void print_device_info<base_device::DEVICE_GPU>(
375379
sprintf(msg, " Run time limit on kernels: %s\n",
376380
deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
377381
ofs_device << msg << std::endl;
382+
#endif
378383
sprintf(msg, " Integrated GPU sharing Host Memory: %s\n",
379384
deviceProp.integrated ? "Yes" : "No");
380385
ofs_device << msg << std::endl;
@@ -399,13 +404,15 @@ void print_device_info<base_device::DEVICE_GPU>(
399404
sprintf(msg, " Supports Cooperative Kernel Launch: %s\n",
400405
deviceProp.cooperativeLaunch ? "Yes" : "No");
401406
ofs_device << msg << std::endl;
402-
sprintf(msg, " Supports MultiDevice Co-op Kernel Launch: %s\n",
403-
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
404-
ofs_device << msg << std::endl;
405407
sprintf(msg,
406408
" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
407409
deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
408410
ofs_device << msg << std::endl;
411+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
412+
sprintf(msg, " Supports MultiDevice Co-op Kernel Launch: %s\n",
413+
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
414+
ofs_device << msg << std::endl;
415+
409416
const char *sComputeMode[] = {
410417
"Default (multiple host threads can use ::cudaSetDevice() with device "
411418
"simultaneously)",
@@ -421,7 +428,7 @@ void print_device_info<base_device::DEVICE_GPU>(
421428
ofs_device << msg << std::endl;
422429
ofs_device << " " << sComputeMode[deviceProp.computeMode] << std::endl
423430
<< std::endl;
424-
431+
#endif
425432
// If there are 2 or more GPUs, query to determine whether RDMA is supported
426433
if (deviceCount >= 2) {
427434
cudaDeviceProp prop[64];
@@ -711,4 +718,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
711718
#endif
712719

713720
} // end of namespace information
714-
} // end of namespace base_device
721+
} // end of namespace base_device

source/module_esolver/esolver_ks_pw.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,8 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
520520
hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
521521
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
522522
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
523-
hsolver::DiagoIterAssist<T, Device>::need_subspace);
523+
hsolver::DiagoIterAssist<T, Device>::need_subspace,
524+
PARAM.inp.use_k_continuity);
524525

525526
hsolver_pw_obj.solve(this->p_hamilt,
526527
this->kspw_psi[0],

source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ OperatorEXX<OperatorLCAO<TK, TR>>::OperatorEXX(HS_Matrix_K<TK>* hsk_in,
240240
else if (this->add_hexx_type == Add_Hexx_Type::R)
241241
{
242242
// read in Hexx(R)
243-
const std::string restart_HR_path = PARAM.globalv.global_readin_dir + "HexxR" + std::to_string(PARAM.globalv.myrank);
243+
const std::string restart_HR_path = GlobalC::restart.folder + "HexxR" + std::to_string(PARAM.globalv.myrank);
244244
bool all_exist = true;
245245
for (int is = 0; is < PARAM.inp.nspin; ++is)
246246
{

source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void LCAO_Deepks::cal_gevdm(const int nat, std::vector<torch::Tensor>& gevdm)
115115
// repeat each block for nm times in an additional dimension
116116
torch::Tensor tmp_x = this->pdm[inl].reshape({nm, nm}).unsqueeze(0).repeat({nm, 1, 1});
117117
// torch::Tensor tmp_y = std::get<0>(torch::symeig(tmp_x, true));
118-
torch::Tensor tmp_y = std::get<0>(torch::linalg::eigh(tmp_x, "U"));
118+
torch::Tensor tmp_y = std::get<0>(torch::linalg_eigh(tmp_x, "U"));
119119
torch::Tensor tmp_yshell = torch::eye(nm, torch::TensorOptions().dtype(torch::kFloat64));
120120
std::vector<torch::Tensor> tmp_rpt; // repeated-pdm-tensor (x)
121121
std::vector<torch::Tensor> tmp_rdt; // repeated-d-tensor (y)

source/module_hamilt_lcao/module_deepks/cal_descriptor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ void LCAO_Deepks::cal_descriptor(const int nat)
6767
std::tuple<torch::Tensor, torch::Tensor> d_v(this->d_tensor[inl], vd);
6868
// d_v = torch::symeig(pdm[inl], /*eigenvalues=*/true,
6969
// /*upper=*/true);
70-
d_v = torch::linalg::eigh(pdm[inl], /*uplo*/ "U");
70+
d_v = torch::linalg_eigh(pdm[inl], /*uplo*/ "U");
7171
d_tensor[inl] = std::get<0>(d_v);
7272
}
7373
ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor");

source/module_hamilt_pw/hamilt_pwdft/forces.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "module_hamilt_general/module_surchem/surchem.h"
1717
#include "module_hamilt_general/module_vdw/vdw.h"
1818
#include "kernels/force_op.h"
19-
19+
#include <type_traits>
2020
#ifdef _OPENMP
2121
#include <omp.h>
2222
#endif
@@ -579,7 +579,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
579579
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forcelc_d, forcelc.c, this->nat * 3);
580580
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, vloc_d, vloc.c, vloc.nr * vloc.nc);
581581

582-
hamilt::cal_force_loc_op<FPTYPE, Device>()(
582+
hamilt::cal_force_loc_op<FPTYPE, Device>()(
583583
this->nat,
584584
rho_basis->npw,
585585
ucell.tpiba * ucell.omega,
@@ -591,6 +591,8 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
591591
vloc_d,
592592
vloc.nc,
593593
forcelc_d);
594+
595+
594596
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcelc.c, forcelc_d, this->nat * 3);
595597

596598
delmem_int_op()(this->ctx,iat2it_d);
@@ -799,6 +801,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
799801
aux_d,
800802
forceion_d);
801803

804+
802805
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forceion.c, forceion_d, this->nat * 3);
803806
delmem_int_op()(this->ctx,iat2it_d);
804807
delmem_var_op()(this->ctx,gcar_d);
@@ -917,8 +920,25 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
917920
return;
918921
}
919922

923+
namespace hamilt {
924+
925+
#if defined(__ROCM) || defined(__HIP_PLATFORM_AMD__)
926+
template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
927+
template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
920928

929+
template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
930+
template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
931+
#endif
932+
933+
#if defined(__CUDA) || defined(__NVCC__)
934+
template struct cal_force_ew_op<double, base_device::DEVICE_GPU>;
935+
template struct cal_force_ew_op<float, base_device::DEVICE_GPU>;
936+
937+
template struct cal_force_loc_op<double, base_device::DEVICE_GPU>;
938+
template struct cal_force_loc_op<float, base_device::DEVICE_GPU>;
939+
#endif
921940

941+
} // namespace hamilt
922942
template class Forces<double, base_device::DEVICE_CPU>;
923943
#if ((defined __CUDA) || (defined __ROCM))
924944
template class Forces<double, base_device::DEVICE_GPU>;

source/module_hamilt_pw/hamilt_pwdft/global.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "module_hamilt_general/module_xc/xc_functional.h"
1616
#ifdef __CUDA
1717
#include "cublas_v2.h"
18+
#include <cuda.h> // for CUDA_VERSION
1819
#include "cufft.h"
1920

2021
static const char* _cublasGetErrorString(cublasStatus_t error)
@@ -65,22 +66,27 @@ static const char* _cufftGetErrorString(cufftResult_t error)
6566
return "CUFFT_INVALID_SIZE";
6667
case CUFFT_UNALIGNED_DATA:
6768
return "CUFFT_UNALIGNED_DATA";
68-
case CUFFT_INCOMPLETE_PARAMETER_LIST:
69-
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
7069
case CUFFT_INVALID_DEVICE:
7170
return "CUFFT_INVALID_DEVICE";
72-
case CUFFT_PARSE_ERROR:
73-
return "CUFFT_PARSE_ERROR";
7471
case CUFFT_NO_WORKSPACE:
7572
return "CUFFT_NO_WORKSPACE";
7673
case CUFFT_NOT_IMPLEMENTED:
7774
return "CUFFT_NOT_IMPLEMENTED";
78-
case CUFFT_LICENSE_ERROR:
79-
return "CUFFT_LICENSE_ERROR";
8075
case CUFFT_NOT_SUPPORTED:
8176
return "CUFFT_NOT_SUPPORTED";
77+
78+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
79+
case CUFFT_INCOMPLETE_PARAMETER_LIST:
80+
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
81+
case CUFFT_PARSE_ERROR:
82+
return "CUFFT_PARSE_ERROR";
83+
case CUFFT_LICENSE_ERROR:
84+
return "CUFFT_LICENSE_ERROR";
85+
#endif
86+
87+
default:
88+
return "<unknown>";
8289
}
83-
return "<unknown>";
8490
}
8591

8692
#define CHECK_CUDA(func) \

source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,35 @@ struct cal_force_ew_op{
179179
FPTYPE* forceion
180180
) {};
181181
};
182+
183+
template <typename FPTYPE, typename Device>
184+
struct cal_force_loc_sincos_op{
185+
void operator()(
186+
const Device* ctx,
187+
const int nat,
188+
const int npw,
189+
const int ntype,
190+
const FPTYPE* gcar,
191+
const FPTYPE* tau,
192+
const FPTYPE* vloc_per_type,
193+
const std::complex<FPTYPE>* aux,
194+
const FPTYPE& scale_factor,
195+
FPTYPE* force) {};
196+
};
197+
198+
template <typename FPTYPE, typename Device>
199+
struct cal_force_ew_sincos_op{
200+
void operator()(
201+
const Device* ctx,
202+
const int nat,
203+
const int npw,
204+
const int ig_gge0,
205+
const FPTYPE* gcar,
206+
const FPTYPE* tau,
207+
const FPTYPE* it_facts,
208+
const std::complex<FPTYPE>* aux,
209+
FPTYPE* force) {};
210+
};
182211
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
183212
template <typename FPTYPE>
184213
struct cal_vkb1_nl_op<FPTYPE, base_device::DEVICE_GPU>
@@ -335,6 +364,32 @@ struct cal_force_ew_op<FPTYPE, base_device::DEVICE_GPU>{
335364
FPTYPE* forceion
336365
);
337366
};
367+
template <typename FPTYPE>
368+
struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU> {
369+
void operator()(const base_device::DEVICE_GPU* ctx,
370+
const int& nat,
371+
const int& npw,
372+
const int& ntype,
373+
const FPTYPE* gcar,
374+
const FPTYPE* tau,
375+
const FPTYPE* vloc_per_type,
376+
const std::complex<FPTYPE>* aux,
377+
const FPTYPE& scale_factor,
378+
FPTYPE* force);
379+
};
380+
381+
template <typename FPTYPE>
382+
struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU> {
383+
void operator()(const base_device::DEVICE_GPU* ctx,
384+
const int& nat,
385+
const int& npw,
386+
const int& ig_gge0,
387+
const FPTYPE* gcar,
388+
const FPTYPE* tau,
389+
const FPTYPE* it_facts,
390+
const std::complex<FPTYPE>* aux,
391+
FPTYPE* force);
392+
};
338393
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
339394
} // namespace hamilt
340-
#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
395+
#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H

0 commit comments

Comments
 (0)