deepmodeling
diff --git a/‎.github/workflows/coverage.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/coverage.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/test.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 0 additions & 1 deletion b/‎.gitignore‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 33 additions & 31 deletions b/‎CMakeLists.txt‎
Lines changed: 33 additions & 31 deletions
diff --git a/‎Dockerfile.cuda‎
Lines changed: 13 additions & 2 deletions b/‎Dockerfile.cuda‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎deps/LibRI‎ b/‎deps/LibRI‎
diff --git a/‎docs/advanced/acceleration/cuda.md‎
Lines changed: 9 additions & 2 deletions b/‎docs/advanced/acceleration/cuda.md‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎docs/advanced/elec_properties/Berry_phase.md‎
Lines changed: 4 additions & 2 deletions b/‎docs/advanced/elec_properties/Berry_phase.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/advanced/elec_properties/charge.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/advanced/elec_properties/charge.md‎
Lines changed: 2 additions & 2 deletions
@@ -15,10 +15,10 @@ jobs:
         uses: actions/checkout@v4
       - name: Install Requirements for Coverage Testing
         run: |
-          apt update && apt install -y lcov
+          apt update && apt install -y lcov gpg
       - name: Building
         run: |
-          cmake -B build -DENABLE_DEEPKS=ON -DENABLE_LIBXC=ON -DBUILD_TESTING=ON -DENABLE_COVERAGE=ON
+          cmake -B build -DENABLE_COVERAGE=ON -DBUILD_TESTING=ON -DENABLE_DEEPKS=ON -DENABLE_LIBXC=ON -DENABLE_LIBRI=ON -DENABLE_PAW=ON -DENABLE_GOOGLEBENCH=ON -DENABLE_RAPIDJSON=ON
           cmake --build build -j`nproc`
           cmake --install build
       - name: Testing
 
@@ -39,7 +39,7 @@ jobs:
             --from-ref ${{ github.event.pull_request.base.sha }}
               --to-ref ${{ github.event.pull_request.head.sha }}
         continue-on-error: true
-      - uses: pre-commit-ci/[email protected].2
+      - uses: pre-commit-ci/[email protected].3
 
       - name: Build
         run: |
 
@@ -23,4 +23,3 @@ time.json
 __pycache__
 abacus.json
 *.npy
-
@@ -238,11 +238,6 @@ if(ENABLE_MPI)
   target_link_libraries(${ABACUS_BIN_NAME} MPI::MPI_CXX)
   add_compile_definitions(__MPI)
   list(APPEND math_libs MPI::MPI_CXX)
-else()
-  find_package(MPI REQUIRED)
-  include_directories(${MPI_CXX_INCLUDE_PATH})
-  target_link_libraries(${ABACUS_BIN_NAME} MPI::MPI_CXX)
-  list(APPEND math_libs MPI::MPI_CXX)
 endif()
 
 find_package(Threads REQUIRED)
@@ -441,22 +436,6 @@ if(ENABLE_FLOAT_FFTW)
 endif()
 
 if(ENABLE_DEEPKS)
-  # Torch uses outdated components to detect CUDA arch, causing failure on
-  # latest CUDA kits. Set CMake variable TORCH_CUDA_ARCH_LIST in the form of
-  # "major.minor" if required.
-  find_package(Torch REQUIRED)
-  if(NOT Torch_VERSION VERSION_LESS "2.1.0")
-    set_if_higher(CMAKE_CXX_STANDARD 17)
-  elseif(NOT Torch_VERSION VERSION_LESS "1.5.0")
-    set_if_higher(CMAKE_CXX_STANDARD 14)
-  endif()
-  include_directories(${TORCH_INCLUDE_DIRS})
-  if(MKL_FOUND)
-    list(PREPEND math_libs ${TORCH_LIBRARIES})
-  else()
-    list(APPEND math_libs ${TORCH_LIBRARIES})
-  endif()
-  add_compile_options(${TORCH_CXX_FLAGS})
   target_link_libraries(${ABACUS_BIN_NAME} deepks)
 
   find_path(libnpy_SOURCE_DIR npy.hpp HINTS ${libnpy_INCLUDE_DIR})
@@ -475,6 +454,25 @@ if(ENABLE_DEEPKS)
   add_compile_definitions(__DEEPKS)
 endif()
 
+# Torch uses outdated components to detect CUDA arch, causing failure on
+# latest CUDA kits. Set CMake variable TORCH_CUDA_ARCH_LIST in the form of
+# "major.minor" if required.
+if(ENABLE_DEEPKS OR DEFINED Torch_DIR)
+  find_package(Torch REQUIRED)
+  if(NOT Torch_VERSION VERSION_LESS "2.1.0")
+    set_if_higher(CMAKE_CXX_STANDARD 17)
+  elseif(NOT Torch_VERSION VERSION_LESS "1.5.0")
+    set_if_higher(CMAKE_CXX_STANDARD 14)
+  endif()
+  include_directories(${TORCH_INCLUDE_DIRS})
+  if(MKL_FOUND)
+    list(PREPEND math_libs ${TORCH_LIBRARIES})
+  else()
+    list(APPEND math_libs ${TORCH_LIBRARIES})
+  endif()
+  add_compile_options(${TORCH_CXX_FLAGS})
+endif()
+
 if (ENABLE_CNPY)
   find_path(cnpy_SOURCE_DIR
     cnpy.h
@@ -492,7 +490,10 @@ if (ENABLE_CNPY)
     include_directories(${cnpy_INCLUDE_DIR})
   endif()
   include_directories(${cnpy_SOURCE_DIR})
-  target_link_libraries(${ABACUS_BIN_NAME} cnpy)
+  
+  # find ZLIB and link
+  find_package(ZLIB REQUIRED)
+  target_link_libraries(${ABACUS_BIN_NAME} cnpy ZLIB::ZLIB)
   add_compile_definitions(__USECNPY)
 endif()
 
@@ -532,7 +533,7 @@ if(ENABLE_LIBRI)
   else()
     message(FATAL_ERROR "Must provide LIBRI_DIR for RI related features.")
   endif()
-  target_link_libraries(${ABACUS_BIN_NAME} ri)
+  target_link_libraries(${ABACUS_BIN_NAME} ri module_exx_symmetry)
   add_compile_definitions(__EXX EXX_DM=3 EXX_H_COMM=2 TEST_EXX_LCAO=0
                           TEST_EXX_RADIAL=1)
 endif()
@@ -592,13 +593,14 @@ if(DEFINED DeePMD_DIR)
     add_compile_definitions(__DPMDC)
   else()
     target_link_libraries(${ABACUS_BIN_NAME} DeePMD::deepmd_cc)
-    if(NOT DEFINED TensorFlow_DIR)
-      set(TensorFlow_DIR ${DeePMD_DIR})
-    endif()
-    find_package(TensorFlow REQUIRED)
-    if(TensorFlow_FOUND)
-      target_link_libraries(${ABACUS_BIN_NAME} TensorFlow::tensorflow_cc)
-    endif()
+  endif()
+endif()
+
+if(DEFINED TensorFlow_DIR)
+  find_package(TensorFlow REQUIRED)
+  include_directories(${TensorFlow_DIR}/include)
+  if(TensorFlow_FOUND)
+    target_link_libraries(${ABACUS_BIN_NAME} TensorFlow::tensorflow_cc)
   endif()
 endif()
 
@@ -680,6 +682,7 @@ add_subdirectory(source)
 target_link_libraries(
   ${ABACUS_BIN_NAME}
   base
+  parameter
   cell
   symmetry
   md
@@ -702,7 +705,6 @@ target_link_libraries(
   psi_initializer
   esolver
   vdw
-  parameter
   device
   container)
 if(ENABLE_LCAO)
 
@@ -1,9 +1,9 @@
 FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
 
 RUN apt update && apt install -y --no-install-recommends \
-    libopenblas-openmp-dev liblapack-dev libscalapack-mpi-dev libelpa-dev libfftw3-dev libcereal-dev \
+    libopenblas-openmp-dev liblapack-dev libscalapack-mpi-dev libfftw3-dev libcereal-dev \
     libxc-dev libgtest-dev libgmock-dev libbenchmark-dev python3-numpy \
-    bc cmake git g++ make bc time sudo unzip vim wget
+    bc cmake git g++ make bc time sudo unzip vim wget libopenmpi-dev gfortran libtool-bin
 
 ENV GIT_SSL_NO_VERIFY=true TERM=xterm-256color \
     OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
@@ -13,6 +13,17 @@ RUN git clone https://github.com/llohse/libnpy.git && \
     cp libnpy/include/npy.hpp /usr/local/include && \
     rm -r libnpy
 
+RUN cd /tmp && \
+    ELPA_VER=2024.05.001 && \
+    wget -q https://elpa.mpcdf.mpg.de/software/tarball-archive/Releases/$ELPA_VER/elpa-$ELPA_VER.tar.gz && \
+    tar xzf elpa-$ELPA_VER.tar.gz  && rm elpa-$ELPA_VER.tar.gz && \
+    cd elpa-$ELPA_VER && \
+    ./configure CXX=mpic++ CFLAGS="-O3 -march=native" FCFLAGS="-O3" LDFLAGS="-L/usr/local/cuda/lib64 -lstdc++" NVCCFLAGS="-arch sm_75 -arch sm_80" --enable-openmp --enable-nvidia-gpu --with-NVIDIA-GPU-compute-capability="sm_70" --with-cuda-path=/usr/local/cuda/ && \
+    make -j`nproc` && \
+    make PREFIX=/usr/local install && \
+    ln -s /usr/local/include/elpa_openmp-$ELPA_VER/elpa /usr/local/include/ && \
+    cd /tmp && rm -rf elpa-$ELPA_VER
+
 ADD https://api.github.com/repos/deepmodeling/abacus-develop/git/refs/heads/develop /dev/null
 
 RUN git clone https://github.com/deepmodeling/abacus-develop.git --depth 1 && \
 
@@ -29,11 +29,15 @@ To compile and use ABACUS in CUDA mode, you currently need to have an NVIDIA GPU
 
 Check the [Advanced Installation Options](https://abacus-rtd.readthedocs.io/en/latest/advanced/install.html#build-with-cuda-support) for the installation of CUDA version support.
 
+Setting both USE_ELPA and USE_CUDA to ON does not automatically make ELPA run on GPUs: GPU support must also be enabled when ELPA itself is compiled. See the ELPA installation guide for how to [enable GPU support](https://github.com/marekandreas/elpa/blob/master/documentation/INSTALL.md).
+
+ABACUS automatically determines whether the installed ELPA supports GPUs based on the `elpa/elpa_configured_options.h` header file; users can check the same header to see whether the ELPA in their environment has GPU support. ELPA introduced the `elpa_setup_gpu` API in version 2023.11.001, so enabling ELPA on GPUs in ABACUS requires ELPA version 2023.11.001 or later.
+
 ## Run with the GPU support by editing the INPUT script:
 
 In `INPUT` file we need to set the input parameter [device](../input_files/input-main.md#device) to `gpu`. If this parameter is not set, ABACUS will try to determine if there are available GPUs.
-- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver` is supported on GPU.
-- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. If you only have one card, this command will only start one GPU.
+- Set `ks_solver`: For the PW basis, CG, BPCG and Davidson methods are supported on GPU; set the input parameter [ks_solver](../input_files/input-main.md#ks_solver) to `cg`, `bpcg` or `dav`. For the LCAO basis, `cusolver`, `cusolvermp` and `elpa` are supported on GPU.
+- **multi-card**: ABACUS allows for multi-GPU acceleration. If you have multiple GPU cards, you can run ABACUS with several MPI processes, and each process will utilize one GPU card. For example, the command `mpirun -n 2 abacus` will by default launch two GPUs for computation. If you only have one card, this command will only start one GPU. 
 
 ## Examples
 We provide [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of GPU calculations.
@@ -42,3 +46,6 @@ We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/devel
 PW basis:
 - Only k point parallelization is supported, so the input keyword `kpar` will be set to match the number of MPI tasks automatically.
 - By default, CUDA architectures 60, 70, 75, 80, 86, and 89 are compiled (if supported). This can be overridden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environment variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html).
+LCAO basis:
+- Avoid using multiple GPUs unless there is a specific reason, as this can be slower than using a single GPU: solving the generalized eigenvalue problem in the LCAO basis incurs additional communication overhead when the calculation is distributed over multiple cards. Using multiple cards is recommended when the memory of a single GPU card is insufficient to complete the task.
+- When `elpa` is used on GPUs, ELPA prints some of its internal log messages.
@@ -23,7 +23,7 @@ pseudo_dir      ../../../tests/PP_ORB  //the path to locate the pesudopotential
 orbital_dir     ../../../tests/PP_ORB  //the path to locate the numerical orbital files
 ntype         3
 ecutwfc       50 // Ry
-symmetry      0 // turn off symmetry
+symmetry      -1 // turn off symmetry
 calculation   nscf // non-self-consistent calculation
 basis_type    lcao // atomic basis
 init_chg  file // read charge from files
@@ -70,4 +70,6 @@ The results are shown as follows:
  P =    0.8906925  (mod    2.1748536)  (   0.0000000,   0.0000000,   0.8906925) C/m^2
 ```
 
-The electric polarization **P** is multivalued, which modulo a quantum e**R**/V~cell~. Note: the values in parentheses are the components of the **P** along the c axis in the x, y, z Cartesian coordinates when set gdir = 3 in INPUT file.
+The electric polarization **P** is multivalued: it is defined only modulo the polarization quantum e**R**/V~cell~.
+
+Note: The vectors R1, R2, and R3 refer to the three lattice vectors of the unit cell. When gdir=3, the calculated polarization is along the R3 direction. The three values in parentheses represent the re-projection of the polarization along the R3 direction onto the Cartesian coordinate system (i.e., the xyz coordinate system). To obtain the full polarization components in the Cartesian system, you need to calculate the polarization for R1, R2, and R3 separately, and then sum their respective x, y, and z components.
@@ -10,7 +10,7 @@ After finishing the calculation, the information of the charge density is stroed
 The SPIN${spin}_CHG.cube file looks like:
 
 ```
-Cubefile created from ABACUS SCF calculation
+STEP: 0  Cubefile created from ABACUS. Inner loop is z, followed by y and x
 2 (nspin) 0.914047 (fermi energy, in Ry)
 2 0.0 0.0 0.0 
 27 0.222222 0 0
@@ -31,7 +31,7 @@ Cubefile created from ABACUS SCF calculation
  ...
 ```
 
-The first line is a brief description.\
+The first line contains the current ionic step, followed by a brief description that includes the loop order of the data.\
 The second line contains NSPIN and Fermi energy.\
 The following 4 lines describe the lattice, in order:\
 &emsp;total number of atoms, the coordinates of the origin.\