**File: `docs/advanced/acceleration/cuda.md`**
## Run with GPU support by editing the INPUT script
In the `INPUT` file, set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine automatically whether GPUs are available.
- Set `ks_solver`: For the PW basis, the CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver`, `cusolvermp` and `elpa` are supported on GPU.
- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for the computation. If you only have one card, this command will start only one GPU.
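For example, a minimal sketch of an `INPUT` fragment that enables GPU acceleration for a PW calculation (the `basis_type` choice is illustrative; `cg` or `dav` would work as well):

```
INPUT_PARAMETERS
# run on GPU; if unset, ABACUS tries to detect available GPUs
device      gpu
basis_type  pw
# BPCG typically parallelizes best in a GPU environment
ks_solver   bpcg
```

Launching this with `mpirun -n 2 abacus` would then use two GPU cards, one per MPI process.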
---

The results are shown as follows:

```
P = 0.8906925 (mod 2.1748536) ( 0.0000000, 0.0000000, 0.8906925) C/m^2
```
The electric polarization **P** is multivalued, defined modulo a quantum e**R**/V~cell~.
Note: The vectors **R**1, **R**2, and **R**3 refer to the three lattice vectors of the unit cell. When `gdir = 3`, the calculated polarization is along the **R**3 direction. The three values in parentheses are the re-projection of the polarization along the **R**3 direction onto the Cartesian (xyz) coordinate system. To obtain the full polarization components in the Cartesian system, calculate the polarization for **R**1, **R**2, and **R**3 separately, and then sum their respective x, y, and z components.
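In other words, restating the standard Berry-phase convention described above, any reported value is physically equivalent to

$$
\mathbf{P} = \mathbf{P}_{\text{reported}} + n\,\frac{e\mathbf{R}}{V_{\text{cell}}}, \qquad n \in \mathbb{Z},
$$

so in the output above the polarization along the chosen lattice vector is defined only up to multiples of the quantum 2.1748536 C/m^2.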
**File: `docs/advanced/input_files/input-main.md`**
- [pw\_diag\_thr](#pw_diag_thr)
- [pw\_diag\_nmax](#pw_diag_nmax)
- [pw\_diag\_ndim](#pw_diag_ndim)
- [diago\_full\_acc](#diago_full_acc)
- [erf\_ecut](#erf_ecut)
- [fft\_mode](#fft_mode)
- [erf\_height](#erf_height)
### pw_diag_ndim
- **Description**: Only useful when you use `ks_solver = dav` or `ks_solver = dav_subspace`. It indicates the dimension of the workspace (the number of wavefunction packets; at least 2 are needed) for the Davidson method. A larger value may reduce the number of iterations in the algorithm, but uses more memory and more CPU time for the subspace diagonalization.
- **Default**: 4
### diago_full_acc
- **Type**: bool
- **Description**: Only useful when you use `ks_solver = dav_subspace`. If `TRUE`, all the empty states are diagonalized at the same level of accuracy as the occupied ones. Otherwise, the empty states are diagonalized using a larger threshold (10^-5); this should not affect the total energy, forces, and other ground-state properties.
- **Default**: false
### erf_ecut
- **Type**: Real
### ks_solver

For the plane-wave basis,
- **cg**: the conjugate gradient (CG) method.
- **bpcg**: the block-parallel conjugate gradient (BPCG) method, which typically exhibits higher acceleration in a GPU environment.
- **dav**: the Davidson algorithm.
- **dav_subspace**: the Davidson algorithm without the orthogonalization operation; this method is the most recommended for efficiency. `pw_diag_ndim` can be set to 2 for this method.
For the atomic orbital basis,
- **lapack**: This method is only available for the serial version. For the parallel version, please use **scalapack_gvx**.
- **genelpa**: This method should be used if you choose localized orbitals.
- **scalapack_gvx**: ScaLAPACK can also be used for localized orbitals.
- **cusolver**: This method requires building with CUDA, and at least one GPU must be available.
- **cusolvermp**: This method supports multi-GPU acceleration and requires building with CUDA. Note that when using `cusolvermp`, you should set the number of MPI processes equal to the number of GPUs.
- **elpa**: The ELPA solver supports both CPU and GPU. By setting `device` to `gpu`, you can launch the ELPA solver with GPU acceleration (provided that you have installed a GPU-supported version of ELPA, which requires you to compile and install ELPA manually, and ABACUS has been compiled with `-DUSE_ELPA=ON` and `-DUSE_CUDA=ON`). The ELPA solver also supports multi-GPU acceleration.
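For reference, a sketch of the CMake configuration implied by the note above (any additional options, such as the CUDA compiler path, are omitted here):

```
cmake -B build -DUSE_ELPA=ON -DUSE_CUDA=ON
```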
If you set `ks_solver=genelpa` for `basis_type=pw`, the program will stop with an error message. The user then has to correct the input file and restart the calculation.
- **Default**:
  - **PW basis**: cg.
  - **LCAO basis**:
    - genelpa (if the compile option `USE_ELPA` has been set)
    - lapack (if the compile option `ENABLE_MPI` has not been set)
    - scalapack_gvx (if `USE_ELPA` has not been set and `ENABLE_MPI` has been set)
    - cusolver (if the compile option `USE_CUDA` has been set)
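For instance, a minimal sketch of an `INPUT` fragment selecting a GPU solver for the LCAO basis (assuming ABACUS was built with `-DUSE_CUDA=ON`):

```
INPUT_PARAMETERS
basis_type  lcao
device      gpu
# or elpa, if a GPU-enabled ELPA build is installed
ks_solver   cusolver
```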
**File: `docs/advanced/input_files/kpt.md`**
ABACUS uses periodic boundary conditions for both crystals and finite systems.
## Gamma-only Calculations
In ABACUS, we offer the option of running gamma-only calculations for the LCAO basis by setting [gamma_only](./input-main.md#gamma_only) to 1. Due to implementation details, a gamma-only calculation will be slightly faster than running a non gamma-only calculation while explicitly setting the gamma point to be the only k-point, but the results should be consistent.
> If gamma_only is set to 1, the KPT file will be overwritten. So make sure to turn off gamma_only for multi-k calculations.
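A minimal sketch of the corresponding `INPUT` setting (the `basis_type` line is shown for context, since gamma-only runs apply to the LCAO basis):

```
INPUT_PARAMETERS
basis_type  lcao
gamma_only  1
```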
ABACUS uses the Monkhorst-Pack method to generate the k-mesh, and the following is an example input k-point (`KPT`) file:

```
K_POINTS            //keyword for start
0                   //total number of k-points; `0' means generate automatically
Gamma               //which kind of Monkhorst-Pack method, `Gamma' or `MP'
2 2 2 0 0 0         //first three numbers: subdivisions along reciprocal vectors
                    //last three numbers: shift of the mesh
```
## Band structure calculations
ABACUS uses specified high-symmetry directions of the Brillouin zone for band structure calculations. The third line of the k-point file should start with 'Line' or 'Line_Cartesian' for line mode. 'Line' means the positions below are in Direct coordinates, while 'Line_Cartesian' means they are in Cartesian coordinates.
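For illustration, a hypothetical line-mode `KPT` file might look as follows (the k-path and point counts are made up for the example):

```
K_POINTS            //keyword for start
4                   //number of high-symmetry points below
Line                //line mode; positions in Direct coordinates
0.0 0.0 0.0 20      //Gamma; 20 k-points toward the next point
0.5 0.0 0.0 20      //X
0.5 0.5 0.0 20      //M
0.0 0.0 0.0 1       //back to Gamma; the last point takes 1
```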
---
## Build math library from source
> Note: We recommend using the latest available compiler sets, since they offer faster implementations of math functions.
This flag is disabled by default. To build the math functions from source code, define the `USE_ABACUS_LIBM` flag. It is expected to yield better performance on legacy versions of `gcc` and `clang`.
Currently supported math functions:
`sin`, `cos`, `sincos`, `exp`, `cexp`
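As a sketch, the flag can be toggled at configure time with CMake (other options omitted):

```
cmake -B build -DUSE_ABACUS_LIBM=ON
```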
### Add DeePMD-kit Support
> Note: This part is only required if you want to load a trained Deep Potential and run molecular dynamics with it. To train the Deep Potential with DP-GEN, no extra prerequisite is needed; please refer to [this page](http://abacus.deepmodeling.com/en/latest/advanced/interface/dpgen.html) for the ABACUS interface with DP-GEN.
To compile ABACUS with DeePMD-kit, you need to define `DeePMD_DIR` and `TensorFlow_DIR` (TensorFlow backend, optional) and/or `LIBTORCH_DIR` (PyTorch backend, optional) in the file `Makefile.vars`.
Alternatively, if the `tensorflow_cc` and `torch` libraries are in the same directory as the `deepmd_c`/`deepmd_cc` libraries, then
```makefile
make DeePMD_DIR=/dir_to_deepmd-kit
```
If your DeePMD-kit supports the TensorFlow backend but its libraries are placed in another directory, then
```makefile
make DeePMD_DIR=/dir_to_deepmd-kit TensorFlow_DIR=/dir_to_tensorflow
```

Similarly, if your DeePMD-kit supports the PyTorch backend but its libraries are placed in another directory, then

```makefile
make DeePMD_DIR=/dir_to_deepmd-kit Torch_DIR=/dir_to_pytorch
```

> The `deepmd_c`/`deepmd_cc` and `tensorflow_cc` libraries will be called according to `DeePMD_DIR` and `TensorFlow_DIR`, as shown in detail on [this page](https://github.com/deepmodeling/deepmd-kit/blob/master/doc/inference/cxx.md). If `TensorFlow_DIR` is not defined, it defaults to `DeePMD_DIR`. Note that `tensorflow_cc` is not required if `deepmd_c` is found.
### Add LibRI Support
To use the new EXX, you need two libraries, [LibRI](https://github.com/abacusmodeling/LibRI) and [LibComm](https://github.com/abacusmodeling/LibComm), and you need to define `LIBRI_DIR` and `LIBCOMM_DIR` in the file `Makefile.vars` or on the `make` command line.
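Presumably, following the pattern of the DeePMD-kit example above (the directory paths are placeholders):

```makefile
make LIBRI_DIR=/dir_to_LibRI LIBCOMM_DIR=/dir_to_LibComm
```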
0 commit comments