From 79296756758bee55285504fbbf05c525cac143eb Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 13:23:59 +0800
Subject: [PATCH 01/19] Add optional LCAO base GPU versions supported by
 cusolvermp

---
 toolchain/build_abacus_gnu.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/toolchain/build_abacus_gnu.sh b/toolchain/build_abacus_gnu.sh
index 27328c7eec..4c17e9538b 100755
--- a/toolchain/build_abacus_gnu.sh
+++ b/toolchain/build_abacus_gnu.sh
@@ -55,7 +55,9 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
 #         -DENABLE_LIBRI=ON \
 #         -DLIBRI_DIR=$LIBRI \
 #         -DLIBCOMM_DIR=$LIBCOMM \
-# 	      -DDeePMD_DIR=$DEEPMD \
+# 	      -DUSE_CUDA=ON \
+#         -DENABLE_CUSOLVERMP=ON \
+#         -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib
 
 # # add mkl env for libtorch to link
 # if one want to install libtorch, mkl should be load in build process
@@ -81,4 +83,4 @@ Done!
 To use the installed ABACUS version
 You need to source ${TOOL}/abacus_env.sh first !
 """
-EOF
\ No newline at end of file
+EOF

From 9b8ed70db597f95d91da42763c9dc6f4104c284b Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:08:15 +0800
Subject: [PATCH 02/19] Add optional LCAO base GPU versions supported by elpa

---
 toolchain/build_abacus_gnu.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/toolchain/build_abacus_gnu.sh b/toolchain/build_abacus_gnu.sh
index 4c17e9538b..23832aa683 100755
--- a/toolchain/build_abacus_gnu.sh
+++ b/toolchain/build_abacus_gnu.sh
@@ -24,6 +24,7 @@ PREFIX=$ABACUS_DIR
 LAPACK=$INSTALL_DIR/openblas-0.3.28/lib
 SCALAPACK=$INSTALL_DIR/scalapack-2.2.1/lib
 ELPA=$INSTALL_DIR/elpa-2025.01.001/cpu
+#ELPA_LINK=$TOOL/build/elpa-2025.01.001/build_nvidia
 FFTW3=$INSTALL_DIR/fftw-3.3.10
 CEREAL=$INSTALL_DIR/cereal-1.3.2/include/cereal
 LIBXC=$INSTALL_DIR/libxc-7.0.0
@@ -58,6 +59,10 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
 # 	      -DUSE_CUDA=ON \
 #         -DENABLE_CUSOLVERMP=ON \
 #         -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib
+#         -DUSE_ELPA=ON \
+#         -DELPA_LINK_LIBRARIES=$ELPA_LINK/.libs/libelpa_openmp.so \
+
+
 
 # # add mkl env for libtorch to link
 # if one want to install libtorch, mkl should be load in build process

From 0a6a0973fbac9c270096ff7a9dff6ff4b4785a9b Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:09:27 +0800
Subject: [PATCH 03/19] Add optional LCAO base GPU versions supported by elpa

---
 toolchain/toolchain_gnu.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/toolchain/toolchain_gnu.sh b/toolchain/toolchain_gnu.sh
index bf5be6a129..93aba3b677 100755
--- a/toolchain/toolchain_gnu.sh
+++ b/toolchain/toolchain_gnu.sh
@@ -30,5 +30,7 @@
 --with-libri=no \
 --with-libcomm=no \
 --with-4th-openmpi=no \
+#--enable-cuda \
+#--gpu-ver=L40S \
 | tee compile.log
-# if you want to use openmpi-version4: set --with-4th-openmpi=yes
\ No newline at end of file
+# if you want to use openmpi-version4: set --with-4th-openmpi=yes

From 471d6f37a1b7ac033f0ed369b2c35ccd80577bf2 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:11:54 +0800
Subject: [PATCH 04/19] Add L40S as GPUVER value for sm_89 architecture

---
 toolchain/install_abacus_toolchain.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/toolchain/install_abacus_toolchain.sh b/toolchain/install_abacus_toolchain.sh
index 2ed465f646..8bd864a154 100755
--- a/toolchain/install_abacus_toolchain.sh
+++ b/toolchain/install_abacus_toolchain.sh
@@ -449,12 +449,13 @@ while [ $# -ge 1 ]; do
     --gpu-ver=*)
       user_input="${1#*=}"
       case "${user_input}" in
-        K20X | K40 | K80 | P100 | V100 | A100 | Mi50 | Mi100 | Mi250 | no)
+        K20X | K40 | K80 | P100 | V100 | A100 | L40S | Mi50 | Mi100 | Mi250 | no)
           export GPUVER="${user_input}"
+          export TARGET="nvidia"
           ;;
         *)
           report_error ${LINENO} \
-            "--gpu-ver currently only supports K20X, K40, K80, P100, V100, A100, Mi50, Mi100, Mi250, and no as options"
+            "--gpu-ver currently only supports K20X, K40, K80, P100, V100, A100, L40S, Mi50, Mi100, Mi250, and no as options"
           exit 1
           ;;
       esac
@@ -836,6 +837,9 @@ case ${GPUVER} in
   A100)
     export ARCH_NUM="80"
     ;;
+  L40S)
+    export ARCH_NUM="89"
+    ;;
   Mi50)
     # TODO: export ARCH_NUM=
     ;;

From 0bbf6f235da5b55b16bb788e3087debe9bf7aaf3 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:13:51 +0800
Subject: [PATCH 05/19] Delete a few lines of content to enable Nvidia to
 compile

---
 toolchain/scripts/stage3/install_elpa.sh | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/toolchain/scripts/stage3/install_elpa.sh b/toolchain/scripts/stage3/install_elpa.sh
index 01e7980810..92b638fde9 100755
--- a/toolchain/scripts/stage3/install_elpa.sh
+++ b/toolchain/scripts/stage3/install_elpa.sh
@@ -138,16 +138,10 @@ case "$with_elpa" in
         ../configure --prefix="${pkg_install_dir}/${TARGET}/" \
           --libdir="${pkg_install_dir}/${TARGET}/lib" \
           --enable-openmp=${enable_openmp} \
-          --enable-shared=yes \
-          --enable-static=yes \
-          --disable-c-tests \
-          --disable-cpp-tests \
           ${config_flags} \
-          --enable-nvidia-gpu-kernels=$([ "$TARGET" = "nvidia" ] && echo "yes" || echo "no") \
+          --enable-nvidia-gpu-kernels \
           --with-cuda-path=${CUDA_PATH:-${CUDA_HOME:-/CUDA_HOME-notset}} \
           --with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_75") \
-          CUDA_CFLAGS="-std=c++14 -allow-unsupported-compiler" \
-          OMPI_MCA_plm_rsh_agent=/bin/false \
           FC=${MPIFC} \
           CC=${MPICC} \
           CXX=${MPICXX} \

From b3defd5b1b1db8485396bc4b5ea265a71a7166ec Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:16:22 +0800
Subject: [PATCH 06/19] Add a specified Fortran mpi compiler for elpa to use

---
 toolchain/scripts/stage1/install_openmpi.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/toolchain/scripts/stage1/install_openmpi.sh b/toolchain/scripts/stage1/install_openmpi.sh
index ab65a89553..ca8fccf7ca 100755
--- a/toolchain/scripts/stage1/install_openmpi.sh
+++ b/toolchain/scripts/stage1/install_openmpi.sh
@@ -80,6 +80,7 @@ case "${with_openmpi}" in
         --prefix=${pkg_install_dir} \
         --libdir="${pkg_install_dir}/lib" \
         --with-libevent=internal \
+        --enable-mpi-fortran FC=gfortran \
         ${EXTRA_CONFIGURE_FLAGS} \
         > configure.log 2>&1 || tail -n ${LOG_LINES} configure.log
       make -j $(get_nprocs) > make.log 2>&1 || tail -n ${LOG_LINES} make.log

From dad1705dd0ee19dd2ca3e395dcb498fbead4a678 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:18:13 +0800
Subject: [PATCH 07/19] Add CUDA path for use by ELPA-GPU

---
 toolchain/toolchain_gnu.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolchain/toolchain_gnu.sh b/toolchain/toolchain_gnu.sh
index 93aba3b677..32bfe31e9f 100755
--- a/toolchain/toolchain_gnu.sh
+++ b/toolchain/toolchain_gnu.sh
@@ -12,7 +12,7 @@
 # libtorch and libnpy are for deepks support, which can be =no
 # if you want to run EXX calculation, you should set --with-libri=install
 # mpich (and intel toolchain) is recommended for EXX support
-
+#export CUDA_PATH=/usr/local/cuda
 ./install_abacus_toolchain.sh \
 --with-gcc=system \
 --with-intel=no \

From 6807886d1439a872337e3f0e1a01ded197da6702 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 15:23:24 +0800
Subject: [PATCH 08/19] Add optional LCAO base GPU versions supported by elpa

---
 toolchain/build_abacus_gnu.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/toolchain/build_abacus_gnu.sh b/toolchain/build_abacus_gnu.sh
index 23832aa683..7db6f2ddda 100755
--- a/toolchain/build_abacus_gnu.sh
+++ b/toolchain/build_abacus_gnu.sh
@@ -56,14 +56,13 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
 #         -DENABLE_LIBRI=ON \
 #         -DLIBRI_DIR=$LIBRI \
 #         -DLIBCOMM_DIR=$LIBCOMM \
+# 	      -DDeePMD_DIR=$DEEPMD \
 # 	      -DUSE_CUDA=ON \
 #         -DENABLE_CUSOLVERMP=ON \
 #         -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib
 #         -DUSE_ELPA=ON \
 #         -DELPA_LINK_LIBRARIES=$ELPA_LINK/.libs/libelpa_openmp.so \
 
-
-
 # # add mkl env for libtorch to link
 # if one want to install libtorch, mkl should be load in build process
 # for -lmkl when load libtorch

From a145e53f2065dbc85768d07c8088a4ff2b6cfa8f Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 22:36:16 +0800
Subject: [PATCH 09/19] Modify a small issue

---
 toolchain/scripts/stage3/install_elpa.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolchain/scripts/stage3/install_elpa.sh b/toolchain/scripts/stage3/install_elpa.sh
index 92b638fde9..a46e7cae49 100755
--- a/toolchain/scripts/stage3/install_elpa.sh
+++ b/toolchain/scripts/stage3/install_elpa.sh
@@ -139,7 +139,7 @@ case "$with_elpa" in
           --libdir="${pkg_install_dir}/${TARGET}/lib" \
           --enable-openmp=${enable_openmp} \
           ${config_flags} \
-          --enable-nvidia-gpu-kernels \
+          --enable-nvidia-gpu-kernels=$([ "$TARGET" = "nvidia" ] && echo "yes" || echo "no") \
           --with-cuda-path=${CUDA_PATH:-${CUDA_HOME:-/CUDA_HOME-notset}} \
           --with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_75") \
           FC=${MPIFC} \

From 5318a96f8c0624cea54ffa0c41a2e212b4d6ee92 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Sun, 16 Mar 2025 22:38:44 +0800
Subject: [PATCH 10/19] Change to manually specifying the link libraries for
 CAL and cusolverMp

---
 CMakeLists.txt | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87fbef8f8e..82730031b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -352,9 +352,19 @@ if(USE_CUDA)
     endif()
     if (ENABLE_CUSOLVERMP)
       add_compile_definitions(__CUSOLVERMP)
+      find_library(CAL_LIBRARY
+          NAMES cal
+          PATHS ${CAL_CUSOLVERMP_PATH}
+          NO_DEFAULT_PATH
+      )
+      find_library(CUSOLVERMP_LIBRARY
+          NAMES cusolverMp
+          PATHS ${CAL_CUSOLVERMP_PATH}
+          NO_DEFAULT_PATH
+      )
       target_link_libraries(${ABACUS_BIN_NAME}
-          cal
-          cusolverMp
+          ${CAL_LIBRARY}
+          ${CUSOLVERMP_LIBRARY}
       )
     endif()
   endif()

From 25a2239d594151aba23cdb21e164b8853133172f Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Tue, 18 Mar 2025 15:12:14 +0800
Subject: [PATCH 11/19] Add the use of 'cusolvermp' or 'elpa' methods to
 compile ABACUS GPU-LCAO

---
 toolchain/README.md | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/toolchain/README.md b/toolchain/README.md
index 7c35a15d4a..279a2f8ef8 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -269,7 +269,35 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
 
 Notice: You CANNOT use `icpx` compiler for GPU version of ABACUS for now, see discussion here [#2906](https://github.com/deepmodeling/abacus-develop/issues/2906) and [#4976](https://github.com/deepmodeling/abacus-develop/issues/4976)
 
-If you wants to use ABACUS GPU-LCAO by `cusolvermp` or `elpa`, please contact the coresponding developer, toolchain do not fully support them now.
+If you wants to use ABACUS GPU-LCAO by "cusolvermp" or "elpa", please compile according to the following usage:
+
+Firstly, in cmake, it is necessary to add `-DUSE_CUDA=ON`, which is necessary for compiling NVIDIA GPUs.
+1. For the elpa method, add
+```shell
+--enable-cuda
+--gpu-ver=GPU name
+export CUDA-PATH=/path/to/CUDA
+```
+to the `toolchain_gnu.sh`, and then follow the normal step to install the dependencies using `./toolchain_gnu.sh`.
+Afterwards, add the link files corresponding to
+```shell
+-DUSE_ELPA=ON \
+-DELPA_LINK-NLIBRARIES=/path/to/lib
+```
+in the `build_abacus_gnu.sh` file, just build the abacus executable program by compiling it with `./build_abacus_gnu.sh`.
+
+2. For the cusolvermp method, toolchain_gnu.sh does not need to be changed, just follow it directly install dependencies using `./toolchain_gnu.sh`, and then add
+```shell
+-DUSE_CUSOLVERMP=ON \
+-D CAL_CUSOLVERMP_PATH=/path/to/math.libs/1x.x/target/x86_64-linux/lib
+```
+to the `build.abacus_gnu.sh` file. At the same time, add the following three items to the environment:
+```shell
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/comm_libs/1x.x/hpcx/hpcx-x.xx/ucc/lib
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/comm_libs/1x.x/hpcx/hpcx-x.xx/ucx/lib
+export CPATH=$CPATH:/path/to/math_libs/1x.x/targets/x86_64-linux/include
+```
+Just enough to build the abacus executable program by compiling it with `./build_abacus_gnu.sh`.
 
 ### Shell problem
 
@@ -325,4 +353,4 @@ of each packages, which may let the installation more fiexible.
 
 ## More
 
-More infomation can be read from `Details.md`.
\ No newline at end of file
+More infomation can be read from `Details.md`.

From 9bd3ffd699139ce68dfe76b6a73269f5a88d7176 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Tue, 18 Mar 2025 15:23:59 +0800
Subject: [PATCH 12/19] Add the use of 'cusolvermp' or 'elpa' methods to
 compile ABACUS GPU-LCAO

---
 toolchain/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/toolchain/README.md b/toolchain/README.md
index 279a2f8ef8..fbf87daf46 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -299,6 +299,10 @@ export CPATH=$CPATH:/path/to/math_libs/1x.x/targets/x86_64-linux/include
 ```
 Just enough to build the abacus executable program by compiling it with `./build_abacus_gnu.sh`.
 
+You can refer to the link video for auxiliary compilation and installation.
+The first one is more complicated, but it doesn't seem to be affected by the CUDA toolkits version and needs to be manually downloaded. Among them, CUSOLPERMP requires installation from sources such as apt or yum, which is suitable for containers or local computers.
+The second one is relatively simple, using NVIDIA HPC_SDK for installation, but requires CUDA toolkits 12.4 and above, suitable for any environment, recommended for use.
+
 ### Shell problem
 
 If you encounter problem like:

From c18551c6f0bf4f9efe19e49f01312f4187b7663d Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Tue, 18 Mar 2025 15:26:10 +0800
Subject: [PATCH 13/19] Add the use of 'cusolvermp' or 'elpa' methods to
 compile ABACUS GPU-LCAO

---
 toolchain/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/toolchain/README.md b/toolchain/README.md
index fbf87daf46..caddb03b3f 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -303,6 +303,8 @@ You can refer to the link video for auxiliary compilation and installation.
 The first one is more complicated, but it doesn't seem to be affected by the CUDA toolkits version and needs to be manually downloaded. Among them, CUSOLPERMP requires installation from sources such as apt or yum, which is suitable for containers or local computers.
 The second one is relatively simple, using NVIDIA HPC_SDK for installation, but requires CUDA toolkits 12.4 and above, suitable for any environment, recommended for use.
 
+https://www.bilibili.com/video/BV1eqr5YuETN/
+
 ### Shell problem
 
 If you encounter problem like:

From c1f832ce102a41aac6544de3473d70ac7268a503 Mon Sep 17 00:00:00 2001
From: JamesMisaka <ff6757442@gmail.com>
Date: Wed, 19 Mar 2025 17:16:25 +0800
Subject: [PATCH 14/19] Add modification - ELPA compiler flags modification -
 GPU_VER setting modification: user should specify the GPU compute capability number,
 but not the GPU name - Modify toolchain_[gnu,intel].sh and
 build_abacus_[gnu,intel].sh to use the above modification

---
 toolchain/README.md                         | 125 +++++++++++---------
 toolchain/build_abacus_gnu.sh               |  10 +-
 toolchain/build_abacus_intel.sh             |   6 +-
 toolchain/install_abacus_toolchain.sh       |  78 +++---------
 toolchain/scripts/stage1/install_openmpi.sh |   1 -
 toolchain/scripts/stage3/install_elpa.sh    |  19 +--
 toolchain/scripts/tool_kit.sh               |   3 +-
 toolchain/toolchain_gnu.sh                  |  13 +-
 toolchain/toolchain_intel.sh                |  10 +-
 9 files changed, 125 insertions(+), 140 deletions(-)

diff --git a/toolchain/README.md b/toolchain/README.md
index caddb03b3f..702c71d3dc 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -2,7 +2,7 @@
 
 Version 2025.1
 
-## Author
+## Main Developer
 
 [QuantumMisaka](https://github.com/QuantumMisaka) 
 (Zhaoqing Liu) @PKU @AISI
@@ -26,8 +26,9 @@ and give setup files that you can use to compile ABACUS.
 - [x] Automatic installation of [CEREAL](https://github.com/USCiLab/cereal) and [LIBNPY](https://github.com/llohse/libnpy) (by github.com)
 - [x] Support for [LibRI](https://github.com/abacusmodeling/LibRI) by submodule or automatic installation from github.com (but installed LibRI via `wget` seems to have some problem, please be cautious)
 - [x] A mirror station by Bohrium database, which can download CEREAL, LibNPY, LibRI and LibComm by `wget` in China Internet. 
-- [x] Support for GPU compilation, users can add `-DUSE_CUDA=1` in builder scripts.
+- [x] Support for GPU-PW and GPU-LCAO compilation (elpa, cusolvermp is developed), and `-DUSE_CUDA=1` is needed in builder scripts.
 - [x] Support for AMD compiler and math lib  `AOCL` and `AOCC` (not fully complete due to flang and AOCC-ABACUS compliation error)
+- [ ] Support for GPU devices other than Nvidia.
 - [ ] Change the downloading url from cp2k mirror to other mirror or directly downloading from official website. (doing)
 - [ ] Support a JSON or YAML configuration file for toolchain, which can be easily modified by users.
 - [ ] A better README and Detail markdown file.
@@ -138,7 +139,9 @@ Dependencies below are optional， which is NOT installed by default:
 - `LibComm` 0.1.1
 
 Users can install them by using `--with-*=install` in toolchain*.sh, which is `no` in default. Also, user can specify the absolute path of the package by `--with-*=path/to/package` in toolchain*.sh to allow toolchain to use the package.
-> Notice: LibRI, LibComm and Libnpy is on actively development, you should check-out the package version when using this toolchain. Also, LibRI and LibComm can be installed by github submodule, that is also work for libnpy, which is more recommended.
+> Notice: LibTorch often suffers from GLIBC_VERSION problems; if you encounter this, please downgrade the LibTorch version to 1.12.1 in scripts/stage4/install_torch.sh
+> 
+> Notice: LibRI, LibComm, RapidJSON and LibNPY are under active development; you should check the package versions when using this toolchain.
 
 Users can easily compile and install dependencies of ABACUS
 by running these scripts after loading `gcc` or `intel-mkl-mpi`
@@ -187,6 +190,69 @@ or you can also do it in a more completely way:
 > rm -rf install build/*/* build/OpenBLAS*/ build/setup_*
 ```
 
+## GPU version of ABACUS
+
+Toolchain supports compiling the GPU version of ABACUS with Nvidia GPUs and CUDA. To use it, add the following options in build*.sh:
+
+```shell
+# in build_abacus_gnu.sh
+cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
+        -DCMAKE_CXX_COMPILER=g++ \
+        -DMPI_CXX_COMPILER=mpicxx \
+        ......
+        -DUSE_CUDA=ON \
+        # -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc \ # add if needed
+        ......
+# in build_abacus_intel.sh
+cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
+        -DCMAKE_CXX_COMPILER=icpc \
+        -DMPI_CXX_COMPILER=mpiicpc \
+        ......
+        -DUSE_CUDA=ON \
+        # -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc \ # add if needed
+        ......
+```
+This enables the GPU version of ABACUS, which can be used directly for PW calculations.
+
+Notice: You CANNOT use `icpx` compiler for GPU version of ABACUS for now, see discussion here [#2906](https://github.com/deepmodeling/abacus-develop/issues/2906) and [#4976](https://github.com/deepmodeling/abacus-develop/issues/4976)
+
+If you want to use ABACUS GPU-LCAO with "cusolvermp" or "elpa", please compile as follows:
+
+1. For the elpa method, add
+```shell
+--enable-cuda
+--gpu-ver=<compute capability of your GPU, e.g. 8.0 or 80 for A100>
+export CUDA_PATH=/path/to/CUDA
+```
+to the `toolchain_gnu.sh`, and then follow the normal step to install the dependencies using `./toolchain_gnu.sh`.
+Afterwards, make sure these options are enabled in your `build_abacus_*.sh` script
+```shell
+-DUSE_ELPA=ON \
+-DUSE_CUDA=ON \
+```
+then just build the abacus executable program by compiling it with `./build_abacus_*.sh`.
+
+2. For the cusolvermp method, `toolchain_*.sh` does not need to be changed; just install the dependencies directly using `./toolchain_*.sh`, and then add
+```shell
+-DUSE_CUDA=ON \
+-DENABLE_CUSOLVERMP=ON \
+-D CAL_CUSOLVERMP_PATH=/path/to/math_libs/1x.x/targets/x86_64-linux/lib
+```
+to the `build_abacus_*.sh` file. Then add the following three items to the environment (assuming you are using the NVIDIA HPC SDK):
+```shell
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/comm_libs/1x.x/hpcx/hpcx-x.xx/ucc/lib
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/comm_libs/1x.x/hpcx/hpcx-x.xx/ucx/lib
+export CPATH=$CPATH:/path/to/math_libs/1x.x/targets/x86_64-linux/include
+```
+This is enough to build the ABACUS executable by compiling it with `./build_abacus_*.sh`.
+
+You can refer to the linked video for help with compilation and installation: [Bilibili](https://www.bilibili.com/video/BV1eqr5YuETN/).
+The first one is more complicated, but it doesn't seem to be affected by the CUDA toolkit version and needs to be manually downloaded. Among them, cuSOLVERMp requires installation from sources such as apt or yum, which is suitable for containers or local computers.
+The second one is relatively simple, using the NVIDIA HPC SDK for installation, but requires CUDA toolkit 12.4 or above; it is suitable for any environment and is the recommended approach.
+
+After compiling, you can specify `device GPU` in INPUT file to use GPU version of ABACUS.
+
+
 ## Common Problems and Solutions
 
 ### Intel-oneAPI problem
@@ -215,7 +281,7 @@ wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/0722521a-34b5-4
 
 Related discussion here [#4976](https://github.com/deepmodeling/abacus-develop/issues/4976)
 
-#### link problem in early 2023 version oneAPI
+#### linking problem in early 2023 version oneAPI
 
 Sometimes Intel-oneAPI have problem to link `mpirun`, 
 which will always show in 2023.2.0 version of MPI in Intel-oneAPI. 
@@ -253,57 +319,6 @@ git clone https://github.com/abacusmodeling/LibComm
 
 OpenMPI in version 5 has huge update, lead to compatibility problem. If one wants to use the OpenMPI in version 4 (4.1.6), one can specify `--with-openmpi-4th=yes` in *toolchain_gnu.sh*
 
-### GPU version of ABACUS
-
-For GPU version of ABACUS (do not GPU version installer of ELPA, which is still doing work), add following options in build*.sh:
-
-```shell
-cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
-        -DCMAKE_CXX_COMPILER=icpx \
-        -DMPI_CXX_COMPILER=mpiicpc \
-        ......
-        -DUSE_CUDA=1 \
-        -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc \
-        ......
-```
-
-Notice: You CANNOT use `icpx` compiler for GPU version of ABACUS for now, see discussion here [#2906](https://github.com/deepmodeling/abacus-develop/issues/2906) and [#4976](https://github.com/deepmodeling/abacus-develop/issues/4976)
-
-If you wants to use ABACUS GPU-LCAO by "cusolvermp" or "elpa", please compile according to the following usage:
-
-Firstly, in cmake, it is necessary to add `-DUSE_CUDA=ON`, which is necessary for compiling NVIDIA GPUs.
-1. For the elpa method, add
-```shell
---enable-cuda
---gpu-ver=GPU name
-export CUDA-PATH=/path/to/CUDA
-```
-to the `toolchain_gnu.sh`, and then follow the normal step to install the dependencies using `./toolchain_gnu.sh`.
-Afterwards, add the link files corresponding to
-```shell
--DUSE_ELPA=ON \
--DELPA_LINK-NLIBRARIES=/path/to/lib
-```
-in the `build_abacus_gnu.sh` file, just build the abacus executable program by compiling it with `./build_abacus_gnu.sh`.
-
-2. For the cusolvermp method, toolchain_gnu.sh does not need to be changed, just follow it directly install dependencies using `./toolchain_gnu.sh`, and then add
-```shell
--DUSE_CUSOLVERMP=ON \
--D CAL_CUSOLVERMP_PATH=/path/to/math.libs/1x.x/target/x86_64-linux/lib
-```
-to the `build.abacus_gnu.sh` file. At the same time, add the following three items to the environment:
-```shell
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/comm_libs/1x.x/hpcx/hpcx-x.xx/ucc/lib
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/comm_libs/1x.x/hpcx/hpcx-x.xx/ucx/lib
-export CPATH=$CPATH:/path/to/math_libs/1x.x/targets/x86_64-linux/include
-```
-Just enough to build the abacus executable program by compiling it with `./build_abacus_gnu.sh`.
-
-You can refer to the link video for auxiliary compilation and installation.
-The first one is more complicated, but it doesn't seem to be affected by the CUDA toolkits version and needs to be manually downloaded. Among them, CUSOLPERMP requires installation from sources such as apt or yum, which is suitable for containers or local computers.
-The second one is relatively simple, using NVIDIA HPC_SDK for installation, but requires CUDA toolkits 12.4 and above, suitable for any environment, recommended for use.
-
-https://www.bilibili.com/video/BV1eqr5YuETN/
 
 ### Shell problem
 
diff --git a/toolchain/build_abacus_gnu.sh b/toolchain/build_abacus_gnu.sh
index 7db6f2ddda..febe2fa5aa 100755
--- a/toolchain/build_abacus_gnu.sh
+++ b/toolchain/build_abacus_gnu.sh
@@ -24,7 +24,7 @@ PREFIX=$ABACUS_DIR
 LAPACK=$INSTALL_DIR/openblas-0.3.28/lib
 SCALAPACK=$INSTALL_DIR/scalapack-2.2.1/lib
 ELPA=$INSTALL_DIR/elpa-2025.01.001/cpu
-#ELPA_LINK=$TOOL/build/elpa-2025.01.001/build_nvidia
+# ELPA=$INSTALL_DIR/elpa-2025.01.001/nvidia # for gpu-lcao
 FFTW3=$INSTALL_DIR/fftw-3.3.10
 CEREAL=$INSTALL_DIR/cereal-1.3.2/include/cereal
 LIBXC=$INSTALL_DIR/libxc-7.0.0
@@ -50,6 +50,7 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
         -DUSE_ELPA=ON \
         -DENABLE_RAPIDJSON=ON \
         -DRapidJSON_DIR=$RAPIDJSON \
+#        -DUSE_CUDA=ON \
 #         -DENABLE_DEEPKS=1 \
 #         -DTorch_DIR=$LIBTORCH \
 #         -Dlibnpy_INCLUDE_DIR=$LIBNPY \
@@ -57,11 +58,8 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
 #         -DLIBRI_DIR=$LIBRI \
 #         -DLIBCOMM_DIR=$LIBCOMM \
 # 	      -DDeePMD_DIR=$DEEPMD \
-# 	      -DUSE_CUDA=ON \
-#         -DENABLE_CUSOLVERMP=ON \
-#         -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib
-#         -DUSE_ELPA=ON \
-#         -DELPA_LINK_LIBRARIES=$ELPA_LINK/.libs/libelpa_openmp.so \
+        #-DENABLE_CUSOLVERMP=ON \
+        #-D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib
 
 # # add mkl env for libtorch to link
 # if one want to install libtorch, mkl should be load in build process
diff --git a/toolchain/build_abacus_intel.sh b/toolchain/build_abacus_intel.sh
index a2ef7dd8b0..5fc96a26b8 100755
--- a/toolchain/build_abacus_intel.sh
+++ b/toolchain/build_abacus_intel.sh
@@ -23,6 +23,7 @@ rm -rf $BUILD_DIR
 
 PREFIX=$ABACUS_DIR
 ELPA=$INSTALL_DIR/elpa-2025.01.001/cpu
+# ELPA=$INSTALL_DIR/elpa-2025.01.001/nvidia # for gpu-lcao
 CEREAL=$INSTALL_DIR/cereal-1.3.2/include/cereal
 LIBXC=$INSTALL_DIR/libxc-7.0.0
 RAPIDJSON=$INSTALL_DIR/rapidjson-1.1.0/
@@ -32,7 +33,7 @@ RAPIDJSON=$INSTALL_DIR/rapidjson-1.1.0/
 # LIBCOMM=$INSTALL_DIR/LibComm-0.1.1
 # DEEPMD=$HOME/apps/anaconda3/envs/deepmd # v3.0 might have problem
 
-# if use deepks and deepmd
+# Notice: if you are compiling with AMD-CPU or GPU-version ABACUS, then `icpc` and `mpiicpc` compilers are recommended 
 cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
         -DCMAKE_CXX_COMPILER=icpx \
         -DMPI_CXX_COMPILER=mpiicpx \
@@ -46,6 +47,7 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
         -DUSE_ELPA=ON \
         -DENABLE_RAPIDJSON=ON \
         -DRapidJSON_DIR=$RAPIDJSON \
+#         -DUSE_CUDA=ON \
 #         -DENABLE_DEEPKS=1 \
 #         -DTorch_DIR=$LIBTORCH \
 #         -Dlibnpy_INCLUDE_DIR=$LIBNPY \
@@ -74,4 +76,4 @@ Done!
 To use the installed ABACUS version
 You need to source ${TOOL}/abacus_env.sh first !
 """
-EOF
\ No newline at end of file
+EOF
diff --git a/toolchain/install_abacus_toolchain.sh b/toolchain/install_abacus_toolchain.sh
index 8bd864a154..b84ac0af1a 100755
--- a/toolchain/install_abacus_toolchain.sh
+++ b/toolchain/install_abacus_toolchain.sh
@@ -328,7 +328,7 @@ export intel_classic="no"
 # and will lead to problem in force calculation
 # but icx is recommended by intel compiler
 # option: --with-intel-classic can change it to yes/no
-# JamesMisaka by 2023.08
+# QuantumMisaka by 2023.08
 export intelmpi_classic="no"
 export with_ifx="yes" # whether ifx is used in oneapi
 export with_flang="no" # whether flang is used in aocc
@@ -397,7 +397,7 @@ while [ $# -ge 1 ]; do
           eval with_${ii}="__INSTALL__"
         fi
       done
-      # I'd like to use OpenMPI as default -- zhaoqing liu in 2023.09.17
+      # I'd like to use OpenMPI as default -- QuantumMisaka in 2023.09.17
       export MPI_MODE="openmpi"
       ;;
     --mpi-mode=*)
@@ -448,17 +448,7 @@ while [ $# -ge 1 ]; do
       ;;
     --gpu-ver=*)
       user_input="${1#*=}"
-      case "${user_input}" in
-        K20X | K40 | K80 | P100 | V100 | A100 | L40S | Mi50 | Mi100 | Mi250 | no)
-          export GPUVER="${user_input}"
-          export TARGET="nvidia"
-          ;;
-        *)
-          report_error ${LINENO} \
-            "--gpu-ver currently only supports K20X, K40, K80, P100, V100, A100, L40S, Mi50, Mi100, Mi250, and no as options"
-          exit 1
-          ;;
-      esac
+      export GPUVER="${user_input}"
       ;;
     --target-cpu=*)
       user_input="${1#*=}"
@@ -685,7 +675,7 @@ else
   esac
 fi
 # If MATH_MODE is mkl ,then openblas, scalapack and fftw is not needed
-# zhaoqing in 2023-09-17
+# QuantumMisaka in 2023-09-17
 if [ "${MATH_MODE}" = "mkl" ]; then
   if [ "${with_openblas}" != "__DONTUSE__" ]; then
     echo "Using MKL, so openblas is disabled."
@@ -701,6 +691,17 @@ if [ "${MATH_MODE}" = "mkl" ]; then
   fi
 fi
 
+# Derive the CUDA compute-capability number from --gpu-ver ("8.0" -> "80"); "no" passes through and means no GPU.
+# QuantumMisaka in 2025-03-19
+export ARCH_NUM="${GPUVER//.}"
+if [[ "${ARCH_NUM}" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Notice: GPU compilation is enabled, and GPU compatibility is set via --gpu-ver to sm_${ARCH_NUM}."
+elif [[ "${ARCH_NUM}" != "no" ]]; then # empty or malformed value: reject; "no" stays silent instead of reporting sm_no
+    report_error ${LINENO} \
+        "When GPU compilation is enabled, the --gpu-ver variable should be properly set regarding to GPU compatibility. For check your GPU compatibility, visit https://developer.nvidia.com/cuda-gpus. For example: A100 -> 8.0 (or 80), V100 -> 7.0 (or 70), 4090 -> 8.9 (or 89)"
+    exit 1
+fi
+
 # If CUDA or HIP are enabled, make sure the GPU version has been defined.
 if [ "${ENABLE_CUDA}" = "__TRUE__" ] || [ "${ENABLE_HIP}" = "__TRUE__" ]; then
   if [ "${GPUVER}" = "no" ]; then
@@ -709,9 +710,10 @@ if [ "${ENABLE_CUDA}" = "__TRUE__" ] || [ "${ENABLE_HIP}" = "__TRUE__" ]; then
   fi
 fi
 
-# several packages require cmake.
-if [ "${with_scalapack}" = "__INSTALL__" ]; then
-  [ "${with_cmake}" = "__DONTUSE__" ] && with_cmake="__INSTALL__"
+# ABACUS itself and some dependencies require cmake, so abort when it is disabled (--with-cmake=no).
+if [ "${with_cmake}" = "__DONTUSE__" ]; then
+  report_error ${LINENO} "CMake is required for ABACUS and some dependencies. Please enable it."
+  exit 1
 fi
 
 
@@ -817,48 +819,6 @@ fi
 
 echo "Compiling with $(get_nprocs) processes for target ${TARGET_CPU}."
 
-# Select the correct compute number based on the GPU architecture
-case ${GPUVER} in
-  K20X)
-    export ARCH_NUM="35"
-    ;;
-  K40)
-    export ARCH_NUM="35"
-    ;;
-  K80)
-    export ARCH_NUM="37"
-    ;;
-  P100)
-    export ARCH_NUM="60"
-    ;;
-  V100)
-    export ARCH_NUM="70"
-    ;;
-  A100)
-    export ARCH_NUM="80"
-    ;;
-  L40S)
-    export ARCH_NUM="89"
-    ;;
-  Mi50)
-    # TODO: export ARCH_NUM=
-    ;;
-  Mi100)
-    # TODO: export ARCH_NUM=
-    ;;
-  Mi250)
-    # TODO: export ARCH_NUM=
-    ;;
-  no)
-    export ARCH_NUM="no"
-    ;;
-  *)
-    report_error ${LINENO} \
-      "--gpu-ver currently only supports K20X, K40, K80, P100, V100, A100, Mi50, Mi100, Mi250, and no as options"
-    exit 1
-    ;;
-esac
-
 write_toolchain_env ${INSTALLDIR}
 
 # write toolchain config
diff --git a/toolchain/scripts/stage1/install_openmpi.sh b/toolchain/scripts/stage1/install_openmpi.sh
index ca8fccf7ca..ab65a89553 100755
--- a/toolchain/scripts/stage1/install_openmpi.sh
+++ b/toolchain/scripts/stage1/install_openmpi.sh
@@ -80,7 +80,6 @@ case "${with_openmpi}" in
         --prefix=${pkg_install_dir} \
         --libdir="${pkg_install_dir}/lib" \
         --with-libevent=internal \
-        --enable-mpi-fortran FC=gfortran \
         ${EXTRA_CONFIGURE_FLAGS} \
         > configure.log 2>&1 || tail -n ${LOG_LINES} configure.log
       make -j $(get_nprocs) > make.log 2>&1 || tail -n ${LOG_LINES} make.log
diff --git a/toolchain/scripts/stage3/install_elpa.sh b/toolchain/scripts/stage3/install_elpa.sh
index a46e7cae49..94cc3d1bb9 100755
--- a/toolchain/scripts/stage3/install_elpa.sh
+++ b/toolchain/scripts/stage3/install_elpa.sh
@@ -98,26 +98,27 @@ case "$with_elpa" in
           config_flags="--enable-avx-kernels=${has_AVX} --enable-avx2-kernels=${has_AVX2} --enable-avx512-kernels=${has_AVX512}"
         fi
       fi
+      # CUDA_CFLAGS="-std=c++14 -allow-unsupported-compiler" \
       for TARGET in "cpu" "nvidia"; do
         [ "$TARGET" = "nvidia" ] && [ "$ENABLE_CUDA" != "__TRUE__" ] && continue
+        # disable cpu if cuda is enabled
+        [ "$TARGET" != "nvidia" ] && [ "$ENABLE_CUDA" = "__TRUE__" ] && continue
         echo "Installing from scratch into ${pkg_install_dir}/${TARGET}"
-
         mkdir -p "build_${TARGET}"
         cd "build_${TARGET}"
-        if [ "${with_amd}" != "__DONTUSE__" ]; then
-        echo "AMD compiler detected, enable special option operation"
+        if [ "${with_amd}" != "__DONTUSE__" ] && [ "${with_flang}" = "yes" ] ; then
+        echo "AMD fortran compiler detected, enable special option operation"
         ../configure --prefix="${pkg_install_dir}/${TARGET}/" \
           --libdir="${pkg_install_dir}/${TARGET}/lib" \
           --enable-openmp=${enable_openmp} \
-          --enable-shared=yes \
           --enable-static=yes \
+          --enable-shared=yes \
           --disable-c-tests \
           --disable-cpp-tests \
           ${config_flags} \
           --enable-nvidia-gpu-kernels=$([ "$TARGET" = "nvidia" ] && echo "yes" || echo "no") \
           --with-cuda-path=${CUDA_PATH:-${CUDA_HOME:-/CUDA_HOME-notset}} \
-          --with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_75") \
-          CUDA_CFLAGS="-std=c++14 -allow-unsupported-compiler" \
+          --with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_70") \
           OMPI_MCA_plm_rsh_agent=/bin/false \
           FC=${MPIFC} \
           CC=${MPICC} \
@@ -138,10 +139,14 @@ case "$with_elpa" in
         ../configure --prefix="${pkg_install_dir}/${TARGET}/" \
           --libdir="${pkg_install_dir}/${TARGET}/lib" \
           --enable-openmp=${enable_openmp} \
+          --enable-static=yes \
+          --enable-shared=yes \
+          --disable-c-tests \
+          --disable-cpp-tests \
           ${config_flags} \
           --enable-nvidia-gpu-kernels=$([ "$TARGET" = "nvidia" ] && echo "yes" || echo "no") \
           --with-cuda-path=${CUDA_PATH:-${CUDA_HOME:-/CUDA_HOME-notset}} \
-          --with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_75") \
+          --with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_70") \
           FC=${MPIFC} \
           CC=${MPICC} \
           CXX=${MPICXX} \
diff --git a/toolchain/scripts/tool_kit.sh b/toolchain/scripts/tool_kit.sh
index 8ecfc9decb..e03c125c3b 100755
--- a/toolchain/scripts/tool_kit.sh
+++ b/toolchain/scripts/tool_kit.sh
@@ -369,12 +369,13 @@ check_command() {
 }
 
 # check if directory exists
+# on failure, report an error with a hint about valid --with-PKG values and return 1 (msg added by QuantumMisaka in 2025.03.19)
 check_dir() {
   local __dir=$1
   if [ -d "$__dir" ]; then
     echo "Found directory $__dir"
   else
-    report_error "Cannot find $__dir"
+    report_error "Cannot find $__dir, please check your --with-PKG input to match options: [system|install|no|(path/to/pkg)]"
     return 1
   fi
 }
diff --git a/toolchain/toolchain_gnu.sh b/toolchain/toolchain_gnu.sh
index 32bfe31e9f..929d2b5fb1 100755
--- a/toolchain/toolchain_gnu.sh
+++ b/toolchain/toolchain_gnu.sh
@@ -12,13 +12,15 @@
 # libtorch and libnpy are for deepks support, which can be =no
 # if you want to run EXX calculation, you should set --with-libri=install
 # mpich (and intel toolchain) is recommended for EXX support
-#export CUDA_PATH=/usr/local/cuda
+# to enable gpu-lcao support: set CUDA_PATH below and add --enable-cuda
+# export CUDA_PATH=/usr/local/cuda
+
 ./install_abacus_toolchain.sh \
 --with-gcc=system \
 --with-intel=no \
 --with-openblas=install \
 --with-openmpi=install \
---with-cmake=install \
+--with-cmake=system \
 --with-scalapack=install \
 --with-libxc=install \
 --with-fftw=install \
@@ -30,7 +32,8 @@
 --with-libri=no \
 --with-libcomm=no \
 --with-4th-openmpi=no \
-#--enable-cuda \
-#--gpu-ver=L40S \
+--enable-cuda \
+--gpu-ver=75 \
 | tee compile.log
-# if you want to use openmpi-version4: set --with-4th-openmpi=yes
+# to use openmpi-version4: set --with-4th-openmpi=yes
+# to enable gpu-lcao, add the following lines:
diff --git a/toolchain/toolchain_intel.sh b/toolchain/toolchain_intel.sh
index d12afc919d..536e53c85b 100755
--- a/toolchain/toolchain_intel.sh
+++ b/toolchain/toolchain_intel.sh
@@ -12,13 +12,13 @@
 # libtorch and libnpy are for deepks support, which can be =no
 
 # module load mkl mpi compiler
-
+export CUDA_PATH=/usr/local/cuda
 ./install_abacus_toolchain.sh \
 --with-intel=system \
 --math-mode=mkl \
 --with-gcc=no \
 --with-intelmpi=system \
---with-cmake=install \
+--with-cmake=system \
 --with-scalapack=no \
 --with-libxc=install \
 --with-fftw=no \
@@ -29,6 +29,8 @@
 --with-libnpy=no \
 --with-libri=no \
 --with-libcomm=no \
---with-intel-classic=no \
+--with-intel-classic=yes \
+--enable-cuda \
+--gpu-ver=L40S \
 | tee compile.log
-# if you are using AMD server: set --with-intel-classic=yes
\ No newline at end of file
+# if you are using AMD server: set --with-intel-classic=yes

From 2aea32b7a8909377b4508b8b15bdd19c623ddcec Mon Sep 17 00:00:00 2001
From: JamesMisaka <ff6757442@gmail.com>
Date: Wed, 19 Mar 2025 17:24:51 +0800
Subject: [PATCH 15/19] minor adjustment

---
 toolchain/README.md                | 10 ++++++----
 toolchain/build_abacus_gnu-aocl.sh |  2 +-
 toolchain/toolchain_amd.sh         |  2 +-
 toolchain/toolchain_gnu.sh         |  6 ++++--
 toolchain/toolchain_intel-mpich.sh |  2 +-
 toolchain/toolchain_intel.sh       | 11 +++++++----
 6 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/toolchain/README.md b/toolchain/README.md
index 702c71d3dc..f43a1eb8d9 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -220,11 +220,13 @@ If you wants to use ABACUS GPU-LCAO by "cusolvermp" or "elpa", please compile ac
 
 1. For the elpa method, add
 ```shell
---enable-cuda
---gpu-ver=
 export CUDA-PATH=/path/to/CUDA
+# install_abacus_toolchain.sh part options
+--enable-cuda \
+--gpu-ver=(GPU-compability-number) \
 ```
-to the `toolchain_gnu.sh`, and then follow the normal step to install the dependencies using `./toolchain_gnu.sh`.
+to the `toolchain_*.sh`, and then follow the normal step to install the dependencies using `./toolchain_*.sh`. For checking the GPU compatibility number, you can refer to the [CUDA compatibility](https://developer.nvidia.com/cuda-gpus).
+
 Afterwards, make sure these option are enable in your `build_abacus_*.sh` script 
 ```shell
 -DUSE_ELPA=ON \
@@ -236,7 +238,7 @@ then just build the abacus executable program by compiling it with `./build_abac
 ```shell
 -DUSE_CUDA=ON \
 -DUSE_CUSOLVERMP=ON \
--D CAL_CUSOLVERMP_PATH=/path/to/math.libs/1x.x/target/x86_64-linux/lib
+-D CAL_CUSOLVERMP_PATH=/path/to/math.libs/1x.x/target/x86_64-linux/lib \
 ```
 to the `build.abacus_*.sh` file. add the following three items to the environment (assuming you are using hpcsdk):
 ```shell
diff --git a/toolchain/build_abacus_gnu-aocl.sh b/toolchain/build_abacus_gnu-aocl.sh
index 3ab0ce97fd..ab283efb3b 100755
--- a/toolchain/build_abacus_gnu-aocl.sh
+++ b/toolchain/build_abacus_gnu-aocl.sh
@@ -18,7 +18,7 @@ cd $ABACUS_DIR
 ABACUS_DIR=$(pwd)
 #AOCLhome=/opt/aocl  # user can specify this parameter
 
-BUILD_DIR=build_abacus_gnu
+BUILD_DIR=build_abacus_aocl
 rm -rf $BUILD_DIR
 
 PREFIX=$ABACUS_DIR
diff --git a/toolchain/toolchain_amd.sh b/toolchain/toolchain_amd.sh
index b8055176c6..797f7c67ea 100755
--- a/toolchain/toolchain_amd.sh
+++ b/toolchain/toolchain_amd.sh
@@ -33,5 +33,5 @@
 --with-4th-openmpi=no \
 --with-flang=no \
 | tee compile.log
-# if you want to use openmpi-version4: set --with-4th-openmpi=yes
+# to use openmpi-version4: set --with-4th-openmpi=yes
 # flang is not recommended to use in this stage
\ No newline at end of file
diff --git a/toolchain/toolchain_gnu.sh b/toolchain/toolchain_gnu.sh
index 929d2b5fb1..9b43b50bf0 100755
--- a/toolchain/toolchain_gnu.sh
+++ b/toolchain/toolchain_gnu.sh
@@ -32,8 +32,10 @@
 --with-libri=no \
 --with-libcomm=no \
 --with-4th-openmpi=no \
---enable-cuda \
---gpu-ver=75 \
 | tee compile.log
 # to use openmpi-version4: set --with-4th-openmpi=yes
 # to enable gpu-lcao, add the following lines:
+# --enable-cuda \
+# --gpu-ver=75 \
+# check the compute capability number of your GPU
+# and pass it to --gpu-ver
diff --git a/toolchain/toolchain_intel-mpich.sh b/toolchain/toolchain_intel-mpich.sh
index 1f50679f1a..afa9871762 100755
--- a/toolchain/toolchain_intel-mpich.sh
+++ b/toolchain/toolchain_intel-mpich.sh
@@ -30,4 +30,4 @@
 --with-libcomm=no \
 --with-intel-classic=no \
 | tee compile.log
-# if you are using AMD server: set --with-intel-classic=yes
\ No newline at end of file
+# for using AMD-CPU or GPU-version: set --with-intel-classic=yes
\ No newline at end of file
diff --git a/toolchain/toolchain_intel.sh b/toolchain/toolchain_intel.sh
index 536e53c85b..2e0e771040 100755
--- a/toolchain/toolchain_intel.sh
+++ b/toolchain/toolchain_intel.sh
@@ -29,8 +29,11 @@ export CUDA_PATH=/usr/local/cuda
 --with-libnpy=no \
 --with-libri=no \
 --with-libcomm=no \
---with-intel-classic=yes \
---enable-cuda \
---gpu-ver=L40S \
+--with-intel-classic=no \
 | tee compile.log
-# if you are using AMD server: set --with-intel-classic=yes
+# for using AMD-CPU or GPU-version: set --with-intel-classic=yes
+# to enable gpu-lcao, add the following lines:
+# --enable-cuda \
+# --gpu-ver=75 \
+# check the compute capability number of your GPU
+# and pass it to --gpu-ver
\ No newline at end of file

From 539f59331be548e536f47a80632f7fb534f33ece Mon Sep 17 00:00:00 2001
From: JamesMisaka <ff6757442@gmail.com>
Date: Wed, 19 Mar 2025 17:32:41 +0800
Subject: [PATCH 16/19] update README

---
 toolchain/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolchain/README.md b/toolchain/README.md
index f43a1eb8d9..0b52ec2def 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -26,7 +26,7 @@ and give setup files that you can use to compile ABACUS.
 - [x] Automatic installation of [CEREAL](https://github.com/USCiLab/cereal) and [LIBNPY](https://github.com/llohse/libnpy) (by github.com)
 - [x] Support for [LibRI](https://github.com/abacusmodeling/LibRI) by submodule or automatic installation from github.com (but installed LibRI via `wget` seems to have some problem, please be cautious)
 - [x] A mirror station by Bohrium database, which can download CEREAL, LibNPY, LibRI and LibComm by `wget` in China Internet. 
-- [x] Support for GPU-PW and GPU-LCAO compilation (elpa, cusolvermp is developed), and `-DUSE_CUDA=1` is needed builder scripts.
+- [x] Support for GPU-PW and GPU-LCAO compilation (elpa, cusolvermp is under development), and `-DUSE_CUDA=1` is needed in the builder scripts.
 - [x] Support for AMD compiler and math lib  `AOCL` and `AOCC` (not fully complete due to flang and AOCC-ABACUS compliation error)
 - [ ] Support for more GPU device out of Nvidia.
 - [ ] Change the downloading url from cp2k mirror to other mirror or directly downloading from official website. (doing)

From eb0ab109ec003f736a5f808457f1eb224cd9891f Mon Sep 17 00:00:00 2001
From: JamesMisaka <ff6757442@gmail.com>
Date: Wed, 19 Mar 2025 17:36:25 +0800
Subject: [PATCH 17/19] give back cmake default option

---
 toolchain/toolchain_gnu.sh   | 2 +-
 toolchain/toolchain_intel.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/toolchain/toolchain_gnu.sh b/toolchain/toolchain_gnu.sh
index 9b43b50bf0..bea1ee0793 100755
--- a/toolchain/toolchain_gnu.sh
+++ b/toolchain/toolchain_gnu.sh
@@ -20,7 +20,7 @@
 --with-intel=no \
 --with-openblas=install \
 --with-openmpi=install \
---with-cmake=system \
+--with-cmake=install \
 --with-scalapack=install \
 --with-libxc=install \
 --with-fftw=install \
diff --git a/toolchain/toolchain_intel.sh b/toolchain/toolchain_intel.sh
index 2e0e771040..4af7236896 100755
--- a/toolchain/toolchain_intel.sh
+++ b/toolchain/toolchain_intel.sh
@@ -18,7 +18,7 @@ export CUDA_PATH=/usr/local/cuda
 --math-mode=mkl \
 --with-gcc=no \
 --with-intelmpi=system \
---with-cmake=system \
+--with-cmake=install \
 --with-scalapack=no \
 --with-libxc=install \
 --with-fftw=no \

From 13735bc2a753117ed60591e1fbd5161b48ae5de6 Mon Sep 17 00:00:00 2001
From: JamesMisaka <ff6757442@gmail.com>
Date: Thu, 20 Mar 2025 22:13:40 +0800
Subject: [PATCH 18/19] update README and cusolvermp

---
 toolchain/README.md | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/toolchain/README.md b/toolchain/README.md
index 0b52ec2def..2a28b41131 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -212,18 +212,18 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
         # -DCMAKE_CUDA_COMPILER=${path to cuda toolkit}/bin/nvcc \ # add if needed
         ......
 ```
-which will enable GPU version of ABACUS, can be directly used for PW calculation.
+which will enable GPU version of ABACUS, and the `ks_solver cusolver` method can be directly used for PW and LCAO calculation.
 
 Notice: You CANNOT use `icpx` compiler for GPU version of ABACUS for now, see discussion here [#2906](https://github.com/deepmodeling/abacus-develop/issues/2906) and [#4976](https://github.com/deepmodeling/abacus-develop/issues/4976)
 
-If you wants to use ABACUS GPU-LCAO by "cusolvermp" or "elpa", please compile according to the following usage:
+If you want to use ABACUS GPU-LCAO by `cusolvermp` or `elpa` for multiple-GPU calculation, please compile according to the following usage:
 
 1. For the elpa method, add
 ```shell
-export CUDA-PATH=/path/to/CUDA
+export CUDA_PATH=/path/to/CUDA
 # install_abacus_toolchain.sh part options
 --enable-cuda \
---gpu-ver=(GPU-compability-number) \
+--gpu-ver=(GPU-compatibility-number) \
 ```
 to the `toolchain_*.sh`, and then follow the normal step to install the dependencies using `./toolchain_*.sh`. For checking the GPU compatibility number, you can refer to the [CUDA compatibility](https://developer.nvidia.com/cuda-gpus).
 
@@ -234,7 +234,9 @@ Afterwards, make sure these option are enable in your `build_abacus_*.sh` script
 ```
 then just build the abacus executable program by compiling it with `./build_abacus_*.sh`.
 
-1. For the cusolvermp method, toolchain_gnu.sh does not need to be changed, just follow it directly install dependencies using `./toolchain_*.sh`, and then add
+The ELPA method needs more parameter settings, but it doesn't seem to be affected by the CUDA toolkit version, and there is no need to manually install and package it.
+
+2. For the cusolvermp method, toolchain_*.sh does not need to be changed, just follow it directly install dependencies using `./toolchain_*.sh`, and then add
 ```shell
 -DUSE_CUDA=ON \
 -DUSE_CUSOLVERMP=ON \
@@ -249,8 +251,9 @@ export CPATH=$CPATH:/path/to/math_libs/1x.x/targets/x86_64-linux/include
 Just enough to build the abacus executable program by compiling it with `./build_abacus_*.sh`.
 
 You can refer to the linking video for auxiliary compilation and installation. [Bilibili](https://www.bilibili.com/video/BV1eqr5YuETN/).
-The first one is more complicated, but it doesn't seem to be affected by the CUDA toolkits version and needs to be manually downloaded. Among them, CUSOLPERMP requires installation from sources such as apt or yum, which is suitable for containers or local computers.
-The second one is relatively simple, using NVIDIA HPC_SDK for installation, but requires CUDA toolkits 12.4 and above, which is suitable for any environment and recommended for usage.
+
+The cusolverMP requires installation from sources such as apt or yum, which is suitable for containers or local computers.
+The second choice is using [NVIDIA HPC_SDK](https://developer.nvidia.com/hpc-sdk-downloads) for installation, which is relatively simple, but the package from NVIDIA HPC_SDK may not be suitable, especially for multiple-GPU parallel running. To better use cusolvermp and its dependencies (libcal, ucx, ucc) in multi-GPU running, please contact your server manager.
 
 After compiling, you can specify `device GPU` in INPUT file to use GPU version of ABACUS.
 

From 654f8ba6c8ef38e7d05acfbf49e6ae6a5995c7d5 Mon Sep 17 00:00:00 2001
From: tang070205 <tang070205@proton.me>
Date: Thu, 20 Mar 2025 22:29:57 +0800
Subject: [PATCH 19/19] Update README.md

---
 toolchain/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolchain/README.md b/toolchain/README.md
index 2a28b41131..d190ff1064 100644
--- a/toolchain/README.md
+++ b/toolchain/README.md
@@ -239,7 +239,7 @@ The ELPA method need more parameter setting, but it doesn't seem to be affected
 2. For the cusolvermp method, toolchain_*.sh does not need to be changed, just follow it directly install dependencies using `./toolchain_*.sh`, and then add
 ```shell
 -DUSE_CUDA=ON \
--DUSE_CUSOLVERMP=ON \
+-DENABLE_CUSOLVERMP=ON \
 -D CAL_CUSOLVERMP_PATH=/path/to/math.libs/1x.x/target/x86_64-linux/lib \
 ```
 to the `build.abacus_*.sh` file. add the following three items to the environment (assuming you are using hpcsdk):