deepmodeling
diff --git a/‎.github/workflows/cuda.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/cuda.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 65 additions & 15 deletions b/‎CMakeLists.txt‎
Lines changed: 65 additions & 15 deletions
diff --git a/‎docs/advanced/acceleration/cuda.md‎
Lines changed: 2 additions & 1 deletion b/‎docs/advanced/acceleration/cuda.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/36_gpu/si16_lcao/INPUT‎
Lines changed: 2 additions & 3 deletions b/‎examples/36_gpu/si16_lcao/INPUT‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎examples/36_gpu/si16_lcao/KPT‎
Lines changed: 4 additions & 2 deletions b/‎examples/36_gpu/si16_lcao/KPT‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎examples/36_gpu/si16_pw/INPUT‎
Lines changed: 1 addition & 1 deletion b/‎examples/36_gpu/si16_pw/INPUT‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎source/source_base/truncated_func.h‎
Lines changed: 116 additions & 0 deletions b/‎source/source_base/truncated_func.h‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎source/source_basis/module_nao/test/CMakeLists.txt‎
Lines changed: 10 additions & 10 deletions b/‎source/source_basis/module_nao/test/CMakeLists.txt‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎source/source_esolver/esolver_dm2rho.cpp‎
Lines changed: 2 additions & 2 deletions b/‎source/source_esolver/esolver_dm2rho.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎source/source_esolver/esolver_dp.cpp‎
Lines changed: 1 addition & 1 deletion b/‎source/source_esolver/esolver_dp.cpp‎
Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y ccache
 
-      - name: Build
+      - name: Configure & Build
         run: |
           nvidia-smi
           rm -rf build
 
@@ -129,16 +129,63 @@ if(NOT ENABLE_MPI)
 endif()
 
 # Different exe files of ABACUS
-if(ENABLE_LCAO AND ENABLE_MPI)
-  set(ABACUS_BIN_NAME abacus)
-elseif(NOT ENABLE_LCAO AND ENABLE_MPI)
-  set(ABACUS_BIN_NAME abacus_pw)
-elseif(NOT ENABLE_LCAO AND NOT ENABLE_MPI)
-  set(ABACUS_BIN_NAME abacus_pw_serial)
-elseif(ENABLE_LCAO AND NOT ENABLE_MPI)
-  set(ABACUS_BIN_NAME abacus_serial)
+# Name of the ABACUS executable: abacus_<features><mode>
+#   <features>  1 = PW only (ENABLE_LCAO off)
+#               2 = LCAO
+#               3 = LCAO + LibRI
+#               4 = LCAO + MLALGO
+#               5 = LCAO + LibRI + MLALGO
+#   <mode>      g = CUDA with MPI, p = CPU with MPI, s = CPU serial
+# Drop any stale cache entry so the name always follows the current options.
+unset(ABACUS_BIN_NAME CACHE)
+
+# Step 1: execution-mode suffix.
+if(USE_CUDA AND ENABLE_MPI)
+  set(abacus_bin_suffix g)
+elseif(USE_CUDA)
+  # No executable name is defined for a CUDA build without MPI. Stop here with
+  # a clear message instead of leaving ABACUS_BIN_NAME empty, which would only
+  # surface later as an obscure add_executable()/target_link_libraries() error.
+  message(FATAL_ERROR
+    "USE_CUDA=ON requires ENABLE_MPI=ON: no ABACUS executable name is "
+    "defined for a CUDA build without MPI.")
+elseif(ENABLE_MPI)
+  set(abacus_bin_suffix p)
+else()
+  set(abacus_bin_suffix s)
+endif()
+
+# Step 2: feature index, combined with the suffix chosen above.
+if(NOT ENABLE_LCAO)
+  set(ABACUS_BIN_NAME abacus_1${abacus_bin_suffix})
+elseif(ENABLE_LIBRI AND ENABLE_MLALGO)
+  set(ABACUS_BIN_NAME abacus_5${abacus_bin_suffix})
+elseif(ENABLE_MLALGO)
+  set(ABACUS_BIN_NAME abacus_4${abacus_bin_suffix})
+elseif(ENABLE_LIBRI)
+  set(ABACUS_BIN_NAME abacus_3${abacus_bin_suffix})
+else()
+  set(ABACUS_BIN_NAME abacus_2${abacus_bin_suffix})
 endif()
 
+
 # Use DSP hardware
 if (USE_DSP)
   set(USE_ELPA OFF)
@@ -488,12 +535,12 @@ elseif(NOT USE_SW)
   find_package(Lapack REQUIRED)
   include_directories(${FFTW3_INCLUDE_DIRS})
   list(APPEND math_libs FFTW3::FFTW3 LAPACK::LAPACK BLAS::BLAS)
-if(USE_DSP)
-  target_link_libraries(${ABACUS_BIN_NAME} ${SCALAPACK_LIBRARY_DIR})
-else()
-  find_package(ScaLAPACK REQUIRED)
-  list(APPEND math_libs ScaLAPACK::ScaLAPACK)
-endif()
+  if(USE_DSP)
+    target_link_libraries(${ABACUS_BIN_NAME} ${SCALAPACK_LIBRARY_DIR})
+  else()
+    find_package(ScaLAPACK REQUIRED)
+    list(APPEND math_libs ScaLAPACK::ScaLAPACK)
+  endif()
   if(USE_OPENMP)
     list(APPEND math_libs FFTW3::FFTW3_OMP)
   endif()
@@ -735,7 +782,7 @@ if(ENABLE_LCAO)
     target_link_libraries(${ABACUS_BIN_NAME} genelpa)
   endif()
   if(USE_CUDA)
-    target_link_libraries(diag_cusolver)
+    target_link_libraries(${ABACUS_BIN_NAME} diag_cusolver)
   endif()
 endif()
 if(ENABLE_RAPIDJSON)
@@ -758,6 +805,9 @@ install(PROGRAMS ${ABACUS_BIN_PATH}
         # DESTINATION ${CMAKE_INSTALL_BINDIR}
 )
 
+# Create a symbolic link 'abacus' pointing to the actual executable.
+# CMAKE_COMMAND, CMAKE_INSTALL_PREFIX and DESTDIR are escaped so they are
+# evaluated when the install script runs; this honours
+# `cmake --install <dir> --prefix <p>` and staged `DESTDIR=` installs.
+# Only ABACUS_BIN_NAME is substituted at configure time. Paths are quoted so
+# prefixes containing spaces work, and a failed link is reported as a warning
+# rather than being silently ignored.
+install(CODE "
+  set(abacus_link_dir \"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/bin\")
+  execute_process(
+    COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${ABACUS_BIN_NAME}\" \"\${abacus_link_dir}/abacus\"
+    WORKING_DIRECTORY \"\${abacus_link_dir}\"
+    RESULT_VARIABLE abacus_link_result)
+  if(NOT abacus_link_result EQUAL 0)
+    message(WARNING \"Could not create symlink \${abacus_link_dir}/abacus -> ${ABACUS_BIN_NAME}\")
+  endif()
+")
+
 if(ENABLE_COVERAGE)
   coverage_evaluate()
 endif()
@@ -37,7 +37,8 @@ The ABACUS program will automatically determine whether the current ELPA support
 
 In `INPUT` file we need to set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine if there are available GPUs.
 - Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver`, `cusolvermp` and `elpa` is supported on GPU.
-- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. If you only have one card, this command will only start one GPU. 
+- **single-card**: ABACUS supports single-GPU acceleration. You can run ABACUS directly with the command `abacus`, without launching it through MPI; `ks_solver cusolver` is recommended for the LCAO basis. *Note: avoid using `mpirun -n 1 abacus`.*
+- **multi-card**: ABACUS supports multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will use one GPU card. For example, the command `mpirun -n 2 abacus` will by default use two GPUs for the computation. If you have only one card, this command will use only that one GPU. *Note: the number of MPI processes should equal the number of GPU cards, unless you are using MPS (NVIDIA Multi-Process Service) on your machine.*
 
 ## Examples
 We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of gpu calculations.
 
@@ -3,9 +3,8 @@ INPUT_PARAMETERS
 suffix			autotest
 calculation     scf
 device          gpu
-gamma_only      1  # GPU acceleration currently only support gamma_only set to 1. ### Abacus will generate/overwrite a KPT file when gamma_only is set to 1.
 ks_solver		cusolver  # if not set, the default ks_solver is cusolver,
-                          # you can also choose genelpa or scalapack_gvx.
+                          # you can also choose cusolvermp or elpa if compiled.
 
 #nbands			8
 symmetry		1
@@ -26,7 +25,7 @@ smearing_sigma		0.002
 
 #Parameters (5.Mixing)
 mixing_type		broyden
-mixing_beta		0.3
+mixing_beta		0.4
 
 
 ### [1] Energy cutoff determines the quality of numerical quadratures in your calculations.
 
@@ -1,5 +1,7 @@
 K_POINTS
 0
 Gamma
-1 1 1 0 0 0
-###This file will be overwritten by Abacus because either kspacing is used or gamma_only is set to 1
+5 5 5 0 0 0
+### If you are running an energy calculation, please make sure your final energy is
+### converged with respect to the k-point settings, unless you set a loose k-point
+### mesh on purpose.
@@ -21,7 +21,7 @@ smearing_sigma		0.002
 
 #Parameters (5.Mixing)
 mixing_type		broyden
-mixing_beta		0.3
+mixing_beta		0.4
 
 
 ### [1] Energy cutoff determines the quality of numerical quadratures in your calculations.
 
@@ -0,0 +1,116 @@
+#ifndef MODULE_BASE_TRUNCATED_FUNC_H
+#define MODULE_BASE_TRUNCATED_FUNC_H
+
+#include "source_base/libm/libm.h"
+#include <cstdint>
+#include <cstring>
+#include <complex>
+
+namespace ModuleBase
+{
+
+/**
+ * @brief Exponential that is cut off to zero for very negative arguments.
+ *
+ * When the real part of @p x is below -230.0 the result is exactly zero;
+ * for every other input (including NaN) the call is forwarded to
+ * ModuleBase::libm::exp(x).
+ *
+ * @tparam FPTYPE Floating point type of the argument and of the result.
+ * @param x Argument of the exponential.
+ * @return FPTYPE Zero below the cutoff, otherwise ModuleBase::libm::exp(x).
+ */
+template <typename FPTYPE>
+inline FPTYPE truncated_exp(FPTYPE x)
+{
+    // Written as a negated '<' so that NaN still reaches libm::exp.
+    const bool below_cutoff = std::real(x) < -230.0;
+    if (!below_cutoff)
+    {
+        return ModuleBase::libm::exp(x);
+    }
+    return static_cast<FPTYPE>(0.0);
+}
+
+/**
+ * @brief Complementary error function that is cut off to zero for large arguments.
+ *
+ * When the real part of @p x exceeds 20.0 the result is exactly zero;
+ * for every other input (including NaN) the call is forwarded to std::erfc(x).
+ *
+ * @tparam FPTYPE Real floating point type (float or double).
+ *         NOTE(review): std::erfc provides no std::complex overload, so this
+ *         template is not expected to instantiate for complex types.
+ * @param x Argument of erfc.
+ * @return FPTYPE Zero above the cutoff, otherwise std::erfc(x).
+ */
+template <typename FPTYPE>
+inline FPTYPE truncated_erfc(FPTYPE x)
+{
+    // Written as a negated '>' so that NaN still reaches std::erfc.
+    const bool above_cutoff = std::real(x) > 20.0;
+    if (!above_cutoff)
+    {
+        return std::erfc(x);
+    }
+    return static_cast<FPTYPE>(0.0);
+}
+
+/**
+ * @brief Flush a tiny value to zero in place (generic fallback).
+ *
+ * Sets @p x to zero when std::abs(x) < 1.0e-30 and leaves it untouched
+ * otherwise. The double and float specializations that follow test the
+ * exponent bits instead, so their effective cutoff is 2^-99 (about 1.58e-30)
+ * rather than exactly 1.0e-30.
+ *
+ * @tparam FPTYPE Type of the value; must work with std::abs and be
+ *         constructible from 0.0.
+ * @param x The value to be checked and possibly set to zero.
+ */
+template <typename FPTYPE>
+inline void truncated_underflow(FPTYPE& x)
+{
+    if (std::abs(x) < 1.0e-30)
+    {
+        x = static_cast<FPTYPE>(0.0);
+    }
+}
+
+/**
+ * @brief Specialization for double: exponent-bit test instead of std::abs.
+ *
+ * Assumes the IEEE-754 binary64 layout (11 exponent bits at positions 52-62,
+ * bias 1023). Values whose biased exponent is <= 923 (= 1023 - 100), i.e.
+ * |x| < 2^-99, are set to +0.0; this covers zeros and subnormals as well.
+ * Inf and NaN have the maximum exponent and are left untouched.
+ *
+ * @param x The value to be checked and possibly set to zero.
+ */
+template <>
+inline void truncated_underflow(double& x)
+{
+    static_assert(sizeof(uint64_t) == sizeof(double), "double is expected to be 64 bits wide");
+    // Copy the object representation with memcpy: reading a double through a
+    // uint64_t pointer would violate the strict-aliasing rule.
+    uint64_t bits = 0;
+    std::memcpy(&bits, &x, sizeof(bits));
+    if (((bits >> 52) & 0x7FF) <= 923)
+    {
+        x = 0.0;
+    }
+}
+
+/**
+ * @brief Specialization for float: exponent-bit test instead of std::abs.
+ *
+ * Assumes the IEEE-754 binary32 layout (8 exponent bits at positions 23-30,
+ * bias 127). Values whose biased exponent is <= 27 (= 127 - 100), i.e.
+ * |x| < 2^-99, are set to +0.0f; this covers zeros and subnormals as well.
+ * Inf and NaN have the maximum exponent and are left untouched.
+ *
+ * @param x The value to be checked and possibly set to zero.
+ */
+template <>
+inline void truncated_underflow(float& x)
+{
+    static_assert(sizeof(uint32_t) == sizeof(float), "float is expected to be 32 bits wide");
+    // Same reasoning as for double: memcpy avoids the aliasing violation.
+    uint32_t bits = 0;
+    std::memcpy(&bits, &x, sizeof(bits));
+    if (((bits >> 23) & 0xFF) <= 27)
+    {
+        x = 0.0f;
+    }
+}
+
+/**
+ * @brief Complex overload: flush the real and the imaginary part independently.
+ *
+ * Each component is passed to the scalar truncated_underflow and written back,
+ * so a tiny real part can be zeroed while the imaginary part is kept, and
+ * vice versa.
+ *
+ * @tparam T Component type of the complex number.
+ * @param x The complex value to be checked and possibly truncated.
+ */
+template <typename T>
+inline void truncated_underflow(std::complex<T>& x)
+{
+    T re_part = x.real();
+    T im_part = x.imag();
+    truncated_underflow(re_part);
+    truncated_underflow(im_part);
+    x.real(re_part);
+    x.imag(im_part);
+}
+
+
+} // namespace ModuleBase
+
+#endif // MODULE_BASE_TRUNCATED_FUNC_H
@@ -16,7 +16,7 @@ AddTest(
     ../numerical_radial.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base 
 )
 
@@ -29,7 +29,7 @@ AddTest(
     ../numerical_radial.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base 
 )
 
@@ -42,7 +42,7 @@ AddTest(
     ../numerical_radial.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base 
 )
 
@@ -55,7 +55,7 @@ AddTest(
     ../numerical_radial.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base 
 )
 
@@ -68,7 +68,7 @@ AddTest(
     ../numerical_radial.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
  LIBS parameter ${math_libs} device base 
 )
 
@@ -86,7 +86,7 @@ AddTest(
     ../sphbes_radials.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base 
 )
 
@@ -106,7 +106,7 @@ AddTest(
     ../two_center_bundle.cpp
     ../two_center_integrator.cpp
     ../real_gaunt_table.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base container orb 
 )
 
@@ -135,7 +135,7 @@ AddTest(
     ../radial_set.cpp
     ../numerical_radial.cpp
     ../two_center_bundle.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base container orb 
 )
 
@@ -155,7 +155,7 @@ AddTest(
     ../sphbes_radials.cpp
     ../radial_set.cpp
     ../numerical_radial.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base container orb 
 )
 
@@ -175,7 +175,7 @@ AddTest(
     ../sphbes_radials.cpp
     ../radial_set.cpp
     ../numerical_radial.cpp
-    ../../../source_io/orb_io.cpp
+    ../../../source_io/module_output/orb_io.cpp
   LIBS parameter ${math_libs} device base container orb 
 )
 
@@ -7,8 +7,8 @@
 #include "source_lcao/LCAO_domain.h"
 #include "source_lcao/hamilt_lcao.h"
 #include "source_lcao/module_operator_lcao/operator_lcao.h"
-#include "source_io/cube_io.h"
-#include "../source_io/module_ml/io_npz.h"
+#include "source_io/module_output/cube_io.h"
+#include "source_io/module_ml/io_npz.h"
 #include "source_io/module_output/print_info.h"
 #include "source_lcao/rho_tau_lcao.h" // mohan add 2025-10-24
 
 
@@ -22,7 +22,7 @@
 #include "source_base/parallel_common.h"
 #include "source_base/timer.h"
 #include "source_io/module_output/output_log.h"
-#include "source_io/cif_io.h"
+#include "source_io/module_output/cif_io.h"
 
 #include <iomanip>
 #include <sstream>
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,7 @@`
`1`	`1`	`K_POINTS`
`2`	`2`	`0`
`3`	`3`	`Gamma`
`4`		`-1 1 1 0 0 0`
`5`		`-###This file will be overwritten by Abacus because either kspacing is used or gamma_only is set to 1`
	`4`	`+5 5 5 0 0 0`
	`5`	`+### If you are running an energy calculation, please make sure your final energy is`
	`6`	`+### converged with respect to the k-point settings, unless you set a loose k-point`
	`7`	`+### mesh on purpose.`