add test cuda workflow (#2848)

njzjz · web-flow · commit 544875ed570c · 2023-09-21T12:57:06.000+08:00
Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
diff --git a/.github/workflows/remove_test_cuda_label.yml b/.github/workflows/remove_test_cuda_label.yml
@@ -0,0 +1,18 @@
+name: Test CUDA
+on:
+  pull_request_target:
+    types:
+      - labeled
+jobs:
+  remove_label:
+    # Drop the label as soon as it is applied, so applying it again re-triggers the workflow.
+    if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: read
+    steps:
+    - uses: actions-ecosystem/action-remove-labels@v1
+      with:
+        number: ${{ github.event.pull_request.number }}
+        labels: Test CUDA
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
@@ -0,0 +1,69 @@
+on:
+  # manually trigger
+  workflow_dispatch:
+  # fires whenever a label is applied to a PR; the job-level `if` filters for `Test CUDA`
+  pull_request:
+    types:
+      - "labeled"
+name: Test CUDA
+jobs:
+  test_cuda:
+    name: Test Python and C++ on CUDA
+    # label of the self-hosted runner with an NVIDIA card
+    runs-on: nvidia
+    # Only run in the upstream organization, and only for the `Test CUDA` label or a manual dispatch.
+    # The parentheses matter: `&&` binds tighter than `||`, so without them a manual dispatch
+    # would bypass the repository-owner check.
+    if: github.repository_owner == 'deepmodeling' && (github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch')
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+        cache: 'pip'
+    - name: Setup MPI
+      uses: mpi4py/setup-mpi@v1
+      with:
+        mpi: mpich
+    - uses: lukka/get-cmake@latest
+    # install the CUDA 11.8 toolkit and cuDNN from NVIDIA's apt repository
+    - run: |
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
+        && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
+        && sudo apt-get update \
+        && sudo apt-get -y install cuda-11-8 libcudnn8=8.9.5.*-1+cuda11.8
+    - run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
+    # editable install of the package with DP_VARIANT=cuda and DP_BUILD_TESTING set
+    - run: pip install -v -e .[gpu,test,lmp,cu11] "ase @ https://github.com/rosswhitfield/ase/archive/edd03571aff6944b77b4a4b055239f3c3e4eeb66.zip"
+      env:
+        DP_BUILD_TESTING: 1
+        DP_VARIANT: cuda
+        CUDA_PATH: /usr/local/cuda-11.8
+    - run: dp --version
+    - run: pytest -s --cov=deepmd --cov=deepmd_cli source/tests --durations=0
+    # configure, build, install and ctest the C++ code (the script maps DP_VARIANT to a CMake flag)
+    - run: source/install/test_cc_local.sh
+      env:
+        OMP_NUM_THREADS: 1
+        TF_INTRA_OP_PARALLELISM_THREADS: 1
+        TF_INTER_OP_PARALLELISM_THREADS: 1
+        LMP_CXX11_ABI_0: 1
+        CMAKE_GENERATOR: Ninja
+        DP_VARIANT: cuda
+        DP_USE_MPICH2: 1
+        CUDA_PATH: /usr/local/cuda-11.8
+    # LAMMPS and i-PI tests, with the dp_test install prefix put on PATH and LD_LIBRARY_PATH
+    - run: |
+        export LD_LIBRARY_PATH=${{ github.workspace }}/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
+        export PATH=${{ github.workspace }}/dp_test/bin:$PATH
+        pytest -s --cov=deepmd source/lmp/tests
+        pytest -s --cov=deepmd source/ipi/tests
+      env:
+        OMP_NUM_THREADS: 1
+        TF_INTRA_OP_PARALLELISM_THREADS: 1
+        TF_INTER_OP_PARALLELISM_THREADS: 1
+        LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp
+        CUDA_PATH: /usr/local/cuda-11.8
+    - uses: codecov/codecov-action@v3
+      with:
+        gcov: true
diff --git a/doc/development/cicd.md b/doc/development/cicd.md
@@ -0,0 +1,15 @@
+# CI/CD
+
+<!-- TODO: To be written... -->
+
+## CI
+
+<!-- TODO: To be written... -->
+
+### Test CUDA
+
+The `Test CUDA` action runs tests on a self-hosted runner with an NVIDIA card. It is not triggered for every PR. A developer who has permission to manage labels can apply the `Test CUDA` label to a PR to trigger this action.
+
+<!-- ## CD -->
+
+<!-- TODO: To be written... -->
diff --git a/doc/index.rst b/doc/index.rst
@@ -52,7 +52,6 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
 .. toctree::
    :maxdepth: 2
    :caption: Tutorial
-   :glob:
 
    Tutorials <https://tutorials.deepmodeling.com/>
    Publications <https://deepmodeling.com/blog/papers/deepmd-kit/>
@@ -62,9 +61,12 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
 .. toctree::
    :maxdepth: 5
    :caption: Developer Guide
-   :glob:
 
-   development/*
+   development/cmake
+   development/create-a-model
+   development/type-embedding
+   development/coding-conventions
+   development/cicd
    api_py/api_py
    api_op
    API_CC/api_cc
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
@@ -1,5 +1,11 @@
 set -e
 
+if [ "$DP_VARIANT" = "cuda" ]; then
+	CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE"
+elif [ "$DP_VARIANT" = "rocm" ]; then
+	CUDA_ARGS="-DUSE_ROCM_TOOLKIT=TRUE"
+fi
+
 #------------------
 
 SCRIPT_PATH=$(dirname $(realpath -s $0))
@@ -11,7 +17,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
@@ -1,5 +1,11 @@
 set -e
 
+if [ "$DP_VARIANT" = "cuda" ]; then
+	CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE"
+elif [ "$DP_VARIANT" = "rocm" ]; then
+	CUDA_ARGS="-DUSE_ROCM_TOOLKIT=TRUE"
+fi
+
 #------------------
 
 SCRIPT_PATH=$(dirname $(realpath -s $0))
@@ -12,7 +18,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt
@@ -19,6 +19,11 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION)
 
   target_include_directories(lammps_interface INTERFACE ${LAMMPS_HEADER_DIR})
 
+  if("$ENV{DP_USE_MPICH2}" STREQUAL "1")
+    # Prefer the ".mpich"-suffixed MPI tools in find_package(MPI); see https://stackoverflow.com/a/47976518/9567349
+    set(MPI_EXECUTABLE_SUFFIX ".mpich")
+  endif()
+
   find_package(MPI)
   if(MPI_FOUND)
     set(LAMMPS_MPI_INCLUDE_DIRS ${MPI_CXX_INCLUDE_DIRS})