Skip to content

Commit 452e572

Browse files
[LTS][develop][Toolchain] fix elpa-gpu installation problem in toolchain 202503 (deepmodeling#6631)
* fix(toolchain): fix elpa-gpu installation in legacy way
* fix(toolchain): remove unsupported NEP interface option in LTS
* feat(toolchain-elpa): add nvidia-cub and cusolver flag for elpa-gpu
* fix(openblas): fix inconsistent letter case in the openblas package name
* fix(toolchain): fix wget error output being redirected (and hidden) during download
1 parent c870285 commit 452e572

File tree

8 files changed

+17
-12
lines changed

8 files changed

+17
-12
lines changed

toolchain/build_abacus_aocc-aocl.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ FFTW3=$AOCLhome
3333
LIBRI=$INSTALL_DIR/LibRI-master
3434
LIBCOMM=$INSTALL_DIR/LibComm-master
3535
USE_CUDA=OFF # set ON to enable gpu-abacus
36-
# NEP_DIR=$INSTALL_DIR/NEP_CPU-main
3736
# LIBTORCH=$INSTALL_DIR/libtorch-2.1.2/share/cmake/Torch
3837
# LIBNPY=$INSTALL_DIR/libnpy-1.0.1/include
3938
# DEEPMD=$HOME/apps/anaconda3/envs/deepmd
@@ -60,7 +59,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
6059
-DLIBCOMM_DIR=$LIBCOMM \
6160
-DUSE_CUDA=$USE_CUDA \
6261
# -DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc \
63-
# -DNEP_DIR=$NEP_DIR \
6462
# -DENABLE_DEEPKS=1 \
6563
# -DTorch_DIR=$LIBTORCH \
6664
# -Dlibnpy_INCLUDE_DIR=$LIBNPY \

toolchain/build_abacus_gcc-aocl.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ FFTW3=$AOCLhome
3333
LIBRI=$INSTALL_DIR/LibRI-master
3434
LIBCOMM=$INSTALL_DIR/LibComm-master
3535
USE_CUDA=OFF # set ON to enable gpu-abacus
36-
# NEP_DIR=$INSTALL_DIR/NEP_CPU-main
3736
# LIBTORCH=$INSTALL_DIR/libtorch-2.1.2/share/cmake/Torch
3837
# LIBNPY=$INSTALL_DIR/libnpy-1.0.1/include
3938
# DEEPMD=$HOME/apps/anaconda3/envs/deepmd
@@ -58,7 +57,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
5857
-DLIBCOMM_DIR=$LIBCOMM \
5958
-DUSE_CUDA=$USE_CUDA \
6059
# -DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc \
61-
# -DNEP_DIR=$NEP_DIR \
6260
# -DENABLE_DEEPKS=1 \
6361
# -DTorch_DIR=$LIBTORCH \
6462
# -Dlibnpy_INCLUDE_DIR=$LIBNPY \

toolchain/build_abacus_gnu.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ RAPIDJSON=$INSTALL_DIR/rapidjson-master/
3131
LIBRI=$INSTALL_DIR/LibRI-master
3232
LIBCOMM=$INSTALL_DIR/LibComm-master
3333
USE_CUDA=OFF # set ON to enable gpu-abacus
34-
# NEP_DIR=$INSTALL_DIR/NEP_CPU-main
3534
# LIBTORCH=$INSTALL_DIR/libtorch-2.1.2/share/cmake/Torch
3635
# LIBNPY=$INSTALL_DIR/libnpy-1.0.1/include
3736
# DEEPMD=$HOME/apps/anaconda3/envs/deepmd #
@@ -56,7 +55,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
5655
-DLIBCOMM_DIR=$LIBCOMM \
5756
-DUSE_CUDA=$USE_CUDA \
5857
# -DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc \
59-
# -DNEP_DIR=$NEP_DIR \
6058
# -DENABLE_DEEPKS=1 \
6159
# -DTorch_DIR=$LIBTORCH \
6260
# -Dlibnpy_INCLUDE_DIR=$LIBNPY \

toolchain/build_abacus_intel.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ RAPIDJSON=$INSTALL_DIR/rapidjson-master
3030
LIBRI=$INSTALL_DIR/LibRI-master
3131
LIBCOMM=$INSTALL_DIR/LibComm-master
3232
USE_CUDA=OFF # set ON to enable gpu-abacus
33-
# NEP_DIR=$INSTALL_DIR/NEP_CPU-main
3433
# LIBTORCH=$INSTALL_DIR/libtorch-2.1.2/share/cmake/Torch
3534
# LIBNPY=$INSTALL_DIR/libnpy-1.0.1/include
3635
# DEEPMD=$HOME/apps/anaconda3/envs/deepmd # v3.0 might have problem
@@ -54,7 +53,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \
5453
-DLIBCOMM_DIR=$LIBCOMM \
5554
-DUSE_CUDA=$USE_CUDA \
5655
# -DCMAKE_CUDA_COMPILER=/path/to/cuda/bin/nvcc \
57-
# -DNEP_DIR=$NEP_DIR \
5856
# -DENABLE_DEEPKS=1 \
5957
# -DTorch_DIR=$LIBTORCH \
6058
# -Dlibnpy_INCLUDE_DIR=$LIBNPY \

toolchain/scripts/lib/config_manager.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,11 @@ config_validate() {
481481
CONFIG_CACHE["ARCH_NUM"]="no"
482482
fi
483483

484+
# Backward compatibility: also export ARCH_NUM to environment when set
485+
if [[ -n "${CONFIG_CACHE[ARCH_NUM]}" ]]; then
486+
export ARCH_NUM="${CONFIG_CACHE[ARCH_NUM]}"
487+
fi
488+
484489
return 0
485490
}
486491

@@ -606,6 +611,10 @@ config_export_to_env() {
606611
for key in "${!CONFIG_CACHE[@]}"; do
607612
export "$key"="${CONFIG_CACHE[$key]}"
608613
done
614+
615+
# Backward compatibility for stage scripts expecting uppercase GPU flags
616+
# Installers (e.g., stage3/install_elpa.sh) read ENABLE_CUDA, not enable_cuda
617+
export ENABLE_CUDA="${CONFIG_CACHE[enable_cuda]}"
609618

610619
# Export package list variables
611620
export tool_list

toolchain/scripts/stage2/install_openblas.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ if [[ -z "$version_suffix" && -n "${ABACUS_TOOLCHAIN_VERSION_SUFFIX}" ]]; then
2828
fi
2929
# Load package variables with appropriate version
3030
load_package_vars "openblas" "$version_suffix"
31-
openblas_pkg="openblas-${openblas_ver}.tar.gz"
31+
openblas_pkg="OpenBLAS-${openblas_ver}.tar.gz"
3232

3333
source "${INSTALLDIR}"/toolchain.conf
3434
source "${INSTALLDIR}"/toolchain.env

toolchain/scripts/stage3/install_elpa.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,11 @@ case "$with_elpa" in
114114
fi
115115
fi
116116
for TARGET in "cpu" "nvidia"; do
117-
[ "$TARGET" = "nvidia" ] && [ "$ENABLE_CUDA" != "__TRUE__" ] && continue
117+
# Accept both uppercase and lowercase GPU enable flags for compatibility
118+
gpu_enabled="${ENABLE_CUDA:-${enable_cuda}}"
119+
[ "$TARGET" = "nvidia" ] && [ "$gpu_enabled" != "__TRUE__" ] && continue
118120
# disable cpu if cuda is enabled, only install one
119-
[ "$TARGET" != "nvidia" ] && [ "$ENABLE_CUDA" = "__TRUE__" ] && continue
121+
[ "$TARGET" != "nvidia" ] && [ "$gpu_enabled" = "__TRUE__" ] && continue
120122
# extend the pkg_install_dir by TARGET
121123
# this linking method is totally different from cp2k toolchain
122124
# for cp2k, ref https://github.com/cp2k/cp2k/commit/6fe2fc105b8cded84256248f68c74139dd8fc2e9
@@ -139,6 +141,7 @@ case "$with_elpa" in
139141
--with-cuda-path=${CUDA_PATH:-${CUDA_HOME:-/CUDA_HOME-notset}} \
140142
--enable-nvidia-gpu-kernels=$([ "$TARGET" = "nvidia" ] && echo "yes" || echo "no") \
141143
--with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_70") \
144+
--enable-nvidia-cub --with-cusolver \
142145
OMPI_MCA_plm_rsh_agent=/bin/false \
143146
FC=${MPIFC} \
144147
CC=${MPICC} \
@@ -170,6 +173,7 @@ case "$with_elpa" in
170173
--enable-nvidia-gpu-kernels=$([ "$TARGET" = "nvidia" ] && echo "yes" || echo "no") \
171174
--with-cuda-path=${CUDA_PATH:-${CUDA_HOME:-/CUDA_HOME-notset}} \
172175
--with-NVIDIA-GPU-compute-capability=$([ "$TARGET" = "nvidia" ] && echo "sm_$ARCH_NUM" || echo "sm_70") \
176+
--enable-nvidia-cub --with-cusolver \
173177
FC=${MPIFC} \
174178
CC=${MPICC} \
175179
CXX=${MPICXX} \

toolchain/scripts/tool_kit.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -978,7 +978,7 @@ download_pkg_from_url() {
978978
"smart"|*)
979979
# Smart fallback: try with certificate validation first, then without
980980
echo "Attempting secure download: $__url"
981-
if wget ${DOWNLOADER_FLAGS} "$__url" -O "$__filename" 2>/dev/null; then
981+
if wget ${DOWNLOADER_FLAGS} "$__url" -O "$__filename"; then
982982
echo "Download successful with certificate validation"
983983
else
984984
echo "Certificate validation failed, retrying without certificate check..."

0 commit comments

Comments
 (0)