Skip to content

Commit bc1fdfc

Browse files
authored
Merge branch 'LTS' into xc_output_LTS
2 parents 5f761c7 + e59bbdd commit bc1fdfc

File tree

27 files changed

+2091
-929
lines changed

27 files changed

+2091
-929
lines changed

CMakeLists.txt

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,9 @@ set(ABACUS_BIN_PATH ${CMAKE_CURRENT_BINARY_DIR}/${ABACUS_BIN_NAME})
153153
include_directories(${ABACUS_SOURCE_DIR})
154154
include_directories(${ABACUS_SOURCE_DIR}/module_base/module_container)
155155

156-
set(CMAKE_CXX_STANDARD 11)
156+
if(NOT DEFINED CMAKE_CXX_STANDARD)
157+
set(CMAKE_CXX_STANDARD 11)
158+
endif()
157159
set(CMAKE_CXX_STANDARD_REQUIRED ON)
158160

159161
add_executable(${ABACUS_BIN_NAME} source/main.cpp)
@@ -293,22 +295,33 @@ endif()
293295

294296
if(USE_CUDA)
295297
cmake_minimum_required(VERSION 3.18) # required by `CUDA_ARCHITECTURES` below
298+
find_package(CUDAToolkit REQUIRED)
296299
set_if_higher(CMAKE_CXX_STANDARD 14)
300+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0")
301+
message(STATUS "CUDA ${CUDAToolkit_VERSION} detected. Raising CMAKE_CXX_STANDARD (and hence CMAKE_CUDA_STANDARD) to 17.")
302+
set_if_higher(CMAKE_CXX_STANDARD 17)
303+
endif()
297304
set(CMAKE_CXX_EXTENSIONS ON)
298305
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
299306
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
300307
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
301308
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
302-
find_package(CUDAToolkit REQUIRED)
303309
# check
304310
# https://gitlab.kitware.com/cmake/cmake/-/blob/master/Modules/Internal/CMakeCUDAArchitecturesAll.cmake
305-
# for available architechures in different CUDA versions
306-
set(CMAKE_CUDA_ARCHITECTURES
307-
60 # P100
308-
70 # V100
309-
# Add your CUDA arch here Check the Compute Capability version of your
310-
# GPU at: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
311-
)
311+
# for available architectures in different CUDA versions
312+
# CUDA 13.0+ dropped support for architectures below 75
313+
if(CUDAToolkit_VERSION VERSION_LESS "13.0")
314+
set(CMAKE_CUDA_ARCHITECTURES
315+
60 # P100
316+
70 # V100
317+
)
318+
else()
319+
# Start with empty list; architectures 75+ will be added below
320+
set(CMAKE_CUDA_ARCHITECTURES)
321+
endif()
322+
323+
# Add your CUDA arch here. Check the Compute Capability version of your
324+
# GPU at: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
312325
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0)
313326
list(APPEND CMAKE_CUDA_ARCHITECTURES 75) # T4
314327
endif()

docs/advanced/input_files/input-main.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- [pw\_seed](#pw_seed)
3838
- [pw\_diag\_thr](#pw_diag_thr)
3939
- [diago\_smooth\_ethr](#diago_smooth_ethr)
40+
- [use\_k\_continuity](#use_k_continuity)
4041
- [pw\_diag\_nmax](#pw_diag_nmax)
4142
- [pw\_diag\_ndim](#pw_diag_ndim)
4243
- [erf\_ecut](#erf_ecut)
@@ -774,6 +775,18 @@ These variables are used to control the plane wave related parameters.
774775
- **Description**: If `TRUE`, the smooth threshold strategy, which applies a larger threshold (10e-5) for the empty states, will be implemented in the diagonalization methods. (This strategy should not affect total energy, forces, and other ground-state properties, but computational efficiency will be improved.) If `FALSE`, the smooth threshold strategy will not be applied.
775776
- **Default**: false
776777

778+
### use_k_continuity
779+
780+
- **Type**: Boolean
781+
- **Availability**: Used only for plane wave basis set.
782+
- **Description**: Whether to use k-point continuity for initializing wave functions. When enabled, this strategy exploits the similarity between wavefunctions at neighboring k-points by propagating the wavefunction from a previously initialized k-point to a new k-point, significantly reducing the computational cost of the initial guess.
783+
784+
**Important constraints:**
785+
- Must be used together with `diago_smooth_ethr = 1` for optimal performance
786+
787+
This feature is particularly useful for calculations with dense k-point sampling where the computational cost of wavefunction initialization becomes significant.
788+
- **Default**: false
789+
777790
### pw_diag_nmax
778791

779792
- **Type**: Integer

source/module_base/module_device/device.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#if defined(__CUDA)
1414
#include <cuda_runtime.h>
15+
#include <cuda.h>
1516
#endif
1617

1718
#if defined(__ROCM)
@@ -299,6 +300,7 @@ void print_device_info<base_device::DEVICE_GPU>(
299300
sprintf(msg, " CUDA Capability Major/Minor version number: %d.%d\n",
300301
deviceProp.major, deviceProp.minor);
301302
ofs_device << msg << std::endl;
303+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
302304
sprintf(msg,
303305
" GPU Max Clock rate: %.0f MHz (%0.2f "
304306
"GHz)\n",
@@ -312,6 +314,7 @@ void print_device_info<base_device::DEVICE_GPU>(
312314
sprintf(msg, " Memory Bus Width: %d-bit\n",
313315
deviceProp.memoryBusWidth);
314316
ofs_device << msg << std::endl;
317+
#endif
315318
sprintf(msg,
316319
" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
317320
"%d), 3D=(%d, %d, %d)\n",
@@ -366,6 +369,7 @@ void print_device_info<base_device::DEVICE_GPU>(
366369
sprintf(msg, " Texture alignment: %zu bytes\n",
367370
deviceProp.textureAlignment);
368371
ofs_device << msg << std::endl;
372+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
369373
sprintf(msg,
370374
" Concurrent copy and kernel execution: %s with %d copy "
371375
"engine(s)\n",
@@ -375,6 +379,7 @@ void print_device_info<base_device::DEVICE_GPU>(
375379
sprintf(msg, " Run time limit on kernels: %s\n",
376380
deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
377381
ofs_device << msg << std::endl;
382+
#endif
378383
sprintf(msg, " Integrated GPU sharing Host Memory: %s\n",
379384
deviceProp.integrated ? "Yes" : "No");
380385
ofs_device << msg << std::endl;
@@ -399,13 +404,15 @@ void print_device_info<base_device::DEVICE_GPU>(
399404
sprintf(msg, " Supports Cooperative Kernel Launch: %s\n",
400405
deviceProp.cooperativeLaunch ? "Yes" : "No");
401406
ofs_device << msg << std::endl;
402-
sprintf(msg, " Supports MultiDevice Co-op Kernel Launch: %s\n",
403-
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
404-
ofs_device << msg << std::endl;
405407
sprintf(msg,
406408
" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
407409
deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
408410
ofs_device << msg << std::endl;
411+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
412+
sprintf(msg, " Supports MultiDevice Co-op Kernel Launch: %s\n",
413+
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
414+
ofs_device << msg << std::endl;
415+
409416
const char *sComputeMode[] = {
410417
"Default (multiple host threads can use ::cudaSetDevice() with device "
411418
"simultaneously)",
@@ -421,7 +428,7 @@ void print_device_info<base_device::DEVICE_GPU>(
421428
ofs_device << msg << std::endl;
422429
ofs_device << " " << sComputeMode[deviceProp.computeMode] << std::endl
423430
<< std::endl;
424-
431+
#endif
425432
// If there are 2 or more GPUs, query to determine whether RDMA is supported
426433
if (deviceCount >= 2) {
427434
cudaDeviceProp prop[64];
@@ -711,4 +718,4 @@ void record_device_memory<base_device::DEVICE_GPU>(
711718
#endif
712719

713720
} // end of namespace information
714-
} // end of namespace base_device
721+
} // end of namespace base_device

source/module_esolver/esolver_ks_pw.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,8 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
520520
hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
521521
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
522522
hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
523-
hsolver::DiagoIterAssist<T, Device>::need_subspace);
523+
hsolver::DiagoIterAssist<T, Device>::need_subspace,
524+
PARAM.inp.use_k_continuity);
524525

525526
hsolver_pw_obj.solve(this->p_hamilt,
526527
this->kspw_psi[0],

source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ OperatorEXX<OperatorLCAO<TK, TR>>::OperatorEXX(HS_Matrix_K<TK>* hsk_in,
240240
else if (this->add_hexx_type == Add_Hexx_Type::R)
241241
{
242242
// read in Hexx(R)
243-
const std::string restart_HR_path = PARAM.globalv.global_readin_dir + "HexxR" + std::to_string(PARAM.globalv.myrank);
243+
const std::string restart_HR_path = GlobalC::restart.folder + "HexxR" + std::to_string(PARAM.globalv.myrank);
244244
bool all_exist = true;
245245
for (int is = 0; is < PARAM.inp.nspin; ++is)
246246
{

source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void LCAO_Deepks::cal_gevdm(const int nat, std::vector<torch::Tensor>& gevdm)
115115
// repeat each block for nm times in an additional dimension
116116
torch::Tensor tmp_x = this->pdm[inl].reshape({nm, nm}).unsqueeze(0).repeat({nm, 1, 1});
117117
// torch::Tensor tmp_y = std::get<0>(torch::symeig(tmp_x, true));
118-
torch::Tensor tmp_y = std::get<0>(torch::linalg::eigh(tmp_x, "U"));
118+
torch::Tensor tmp_y = std::get<0>(torch::linalg_eigh(tmp_x, "U"));
119119
torch::Tensor tmp_yshell = torch::eye(nm, torch::TensorOptions().dtype(torch::kFloat64));
120120
std::vector<torch::Tensor> tmp_rpt; // repeated-pdm-tensor (x)
121121
std::vector<torch::Tensor> tmp_rdt; // repeated-d-tensor (y)

source/module_hamilt_lcao/module_deepks/cal_descriptor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ void LCAO_Deepks::cal_descriptor(const int nat)
6767
std::tuple<torch::Tensor, torch::Tensor> d_v(this->d_tensor[inl], vd);
6868
// d_v = torch::symeig(pdm[inl], /*eigenvalues=*/true,
6969
// /*upper=*/true);
70-
d_v = torch::linalg::eigh(pdm[inl], /*uplo*/ "U");
70+
d_v = torch::linalg_eigh(pdm[inl], /*uplo*/ "U");
7171
d_tensor[inl] = std::get<0>(d_v);
7272
}
7373
ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor");

source/module_hamilt_pw/hamilt_pwdft/forces.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "module_hamilt_general/module_surchem/surchem.h"
1717
#include "module_hamilt_general/module_vdw/vdw.h"
1818
#include "kernels/force_op.h"
19-
19+
#include <type_traits>
2020
#ifdef _OPENMP
2121
#include <omp.h>
2222
#endif
@@ -579,7 +579,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
579579
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forcelc_d, forcelc.c, this->nat * 3);
580580
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, vloc_d, vloc.c, vloc.nr * vloc.nc);
581581

582-
hamilt::cal_force_loc_op<FPTYPE, Device>()(
582+
hamilt::cal_force_loc_op<FPTYPE, Device>()(
583583
this->nat,
584584
rho_basis->npw,
585585
ucell.tpiba * ucell.omega,
@@ -591,6 +591,8 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
591591
vloc_d,
592592
vloc.nc,
593593
forcelc_d);
594+
595+
594596
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcelc.c, forcelc_d, this->nat * 3);
595597

596598
delmem_int_op()(this->ctx,iat2it_d);
@@ -799,6 +801,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
799801
aux_d,
800802
forceion_d);
801803

804+
802805
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forceion.c, forceion_d, this->nat * 3);
803806
delmem_int_op()(this->ctx,iat2it_d);
804807
delmem_var_op()(this->ctx,gcar_d);
@@ -917,8 +920,25 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
917920
return;
918921
}
919922

923+
namespace hamilt {
924+
925+
#if defined(__ROCM) || defined(__HIP_PLATFORM_AMD__)
926+
template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
927+
template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
920928

929+
template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
930+
template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
931+
#endif
932+
933+
#if defined(__CUDA) || defined(__NVCC__)
934+
template struct cal_force_ew_op<double, base_device::DEVICE_GPU>;
935+
template struct cal_force_ew_op<float, base_device::DEVICE_GPU>;
936+
937+
template struct cal_force_loc_op<double, base_device::DEVICE_GPU>;
938+
template struct cal_force_loc_op<float, base_device::DEVICE_GPU>;
939+
#endif
921940

941+
} // namespace hamilt
922942
template class Forces<double, base_device::DEVICE_CPU>;
923943
#if ((defined __CUDA) || (defined __ROCM))
924944
template class Forces<double, base_device::DEVICE_GPU>;

source/module_hamilt_pw/hamilt_pwdft/global.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "module_hamilt_general/module_xc/xc_functional.h"
1616
#ifdef __CUDA
1717
#include "cublas_v2.h"
18+
#include <cuda.h> // for CUDA_VERSION
1819
#include "cufft.h"
1920

2021
static const char* _cublasGetErrorString(cublasStatus_t error)
@@ -65,22 +66,27 @@ static const char* _cufftGetErrorString(cufftResult_t error)
6566
return "CUFFT_INVALID_SIZE";
6667
case CUFFT_UNALIGNED_DATA:
6768
return "CUFFT_UNALIGNED_DATA";
68-
case CUFFT_INCOMPLETE_PARAMETER_LIST:
69-
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
7069
case CUFFT_INVALID_DEVICE:
7170
return "CUFFT_INVALID_DEVICE";
72-
case CUFFT_PARSE_ERROR:
73-
return "CUFFT_PARSE_ERROR";
7471
case CUFFT_NO_WORKSPACE:
7572
return "CUFFT_NO_WORKSPACE";
7673
case CUFFT_NOT_IMPLEMENTED:
7774
return "CUFFT_NOT_IMPLEMENTED";
78-
case CUFFT_LICENSE_ERROR:
79-
return "CUFFT_LICENSE_ERROR";
8075
case CUFFT_NOT_SUPPORTED:
8176
return "CUFFT_NOT_SUPPORTED";
77+
78+
#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
79+
case CUFFT_INCOMPLETE_PARAMETER_LIST:
80+
return "CUFFT_INCOMPLETE_PARAMETER_LIST";
81+
case CUFFT_PARSE_ERROR:
82+
return "CUFFT_PARSE_ERROR";
83+
case CUFFT_LICENSE_ERROR:
84+
return "CUFFT_LICENSE_ERROR";
85+
#endif
86+
87+
default:
88+
return "<unknown>";
8289
}
83-
return "<unknown>";
8490
}
8591

8692
#define CHECK_CUDA(func) \

source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,35 @@ struct cal_force_ew_op{
179179
FPTYPE* forceion
180180
) {};
181181
};
182+
183+
template <typename FPTYPE, typename Device>
184+
struct cal_force_loc_sincos_op{
185+
void operator()(
186+
const Device* ctx,
187+
const int nat,
188+
const int npw,
189+
const int ntype,
190+
const FPTYPE* gcar,
191+
const FPTYPE* tau,
192+
const FPTYPE* vloc_per_type,
193+
const std::complex<FPTYPE>* aux,
194+
const FPTYPE& scale_factor,
195+
FPTYPE* force) {};
196+
};
197+
198+
template <typename FPTYPE, typename Device>
199+
struct cal_force_ew_sincos_op{
200+
void operator()(
201+
const Device* ctx,
202+
const int nat,
203+
const int npw,
204+
const int ig_gge0,
205+
const FPTYPE* gcar,
206+
const FPTYPE* tau,
207+
const FPTYPE* it_facts,
208+
const std::complex<FPTYPE>* aux,
209+
FPTYPE* force) {};
210+
};
182211
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
183212
template <typename FPTYPE>
184213
struct cal_vkb1_nl_op<FPTYPE, base_device::DEVICE_GPU>
@@ -335,6 +364,32 @@ struct cal_force_ew_op<FPTYPE, base_device::DEVICE_GPU>{
335364
FPTYPE* forceion
336365
);
337366
};
367+
template <typename FPTYPE>
368+
struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU> {
369+
void operator()(const base_device::DEVICE_GPU* ctx,
370+
const int& nat,
371+
const int& npw,
372+
const int& ntype,
373+
const FPTYPE* gcar,
374+
const FPTYPE* tau,
375+
const FPTYPE* vloc_per_type,
376+
const std::complex<FPTYPE>* aux,
377+
const FPTYPE& scale_factor,
378+
FPTYPE* force);
379+
};
380+
381+
template <typename FPTYPE>
382+
struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU> {
383+
void operator()(const base_device::DEVICE_GPU* ctx,
384+
const int& nat,
385+
const int& npw,
386+
const int& ig_gge0,
387+
const FPTYPE* gcar,
388+
const FPTYPE* tau,
389+
const FPTYPE* it_facts,
390+
const std::complex<FPTYPE>* aux,
391+
FPTYPE* force);
392+
};
338393
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
339394
} // namespace hamilt
340-
#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
395+
#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H

0 commit comments

Comments
 (0)