Workflows: build every job inside the CUDA 13 container; update the Pages actions.

Reviewer notes on this revision of the patch:
* Requirement specifiers and extras are quoted in the new `pip install` lines.
  Unquoted, the shell parses `importlib-metadata>=1.0` as a redirection of
  pip's stdout into a file named `=1.0`, so the version bound was never
  passed to pip.
* cuDNN is installed from the package that matches the container's CUDA major
  version (`cudnn9-cuda-13`) instead of `cudnn9-cuda-12`.
* NOTE(review): indentation was reconstructed from a whitespace-collapsed copy
  of this patch; confirm it against the tree (or apply with
  `--ignore-whitespace`). The post-image blob ids on the `index` lines predate
  these edits.

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d80564274e..f0ad600691 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,7 +12,7 @@ jobs:
     name: 'Core'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
+      image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
       options: --user root
     steps:
       - name: 'Dependencies'
@@ -20,6 +20,7 @@ jobs:
           apt-get update
-          apt-get install -y git python3.9 pip cudnn9-cuda-12
+          apt-get install -y git python3.9 pip cudnn9-cuda-13
           pip install cmake==3.21.0 pybind11[global] ninja
+          git config --global --add safe.directory '*'
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -32,67 +33,53 @@ jobs:
           NVTE_FRAMEWORK: none
           MAX_JOBS: 1
           SCCACHE_GHA_ENABLED: "true"
+          NVTE_CUDA_ARCHS: "100"
       - name: 'Sanity check'
         run: python3 -c "import transformer_engine"
         working-directory: /
   pytorch:
     name: 'PyTorch'
     runs-on: ubuntu-latest
+    container:
+      image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
+      options: --user root
     steps:
-      - name: Move /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
-
-      - name: Maximize build space
-        uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
-        with:
-          root-reserve-mb: 5120
-          temp-reserve-mb: 32
-          swap-size-mb: 10240
-          remove-dotnet: 'true'
-          remove-android: 'true'
-          remove-haskell: 'true'
-          remove-codeql: 'true'
-          build-mount-path: '/var/lib/docker/'
-
-      - name: Restore /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
-
+      - name: 'Dependencies'
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip cudnn9-cuda-13
+          pip install cmake==3.21.0 'pybind11[global]' ninja pydantic 'importlib-metadata>=1.0' packaging numpy einops onnxscript
+          pip install torch --index-url https://download.pytorch.org/whl/cu130
+          git config --global --add safe.directory '*'
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
           submodules: recursive
-
-      - name: Start named container
-        run: |
-          docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity
-
-      - name: 'Dependencies'
-        run: |
-          docker exec builder bash -c '\
-            apt-get update && \
-            apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
-            pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \
-            apt-get clean \
-          '
-
+      - name: ccache
+        uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
       - name: 'Build'
-        run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
+        run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps
         env:
           NVTE_FRAMEWORK: pytorch
           MAX_JOBS: 1
+          SCCACHE_GHA_ENABLED: "true"
+          NVTE_CUDA_ARCHS: "100"
       - name: 'Sanity check'
-        run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
+        run: python3 tests/pytorch/test_sanity_import.py
   jax:
     name: 'JAX'
     runs-on: ubuntu-latest
     container:
-      image: ghcr.io/nvidia/jax:jax
+      image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
       options: --user root
     steps:
       - name: 'Dependencies'
-        run: pip install cmake==3.21.0 pybind11[global]
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip cudnn9-cuda-13
+          pip install cmake==3.21.0 'pybind11[global]' ninja packaging
+          pip install 'jax[cuda13]' 'flax[cuda13]'
+          git config --global --add safe.directory '*'
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -100,57 +87,43 @@ jobs:
       - name: ccache
         uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
       - name: 'Build'
-        run: |
-          NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
+        run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v
         env:
           NVTE_FRAMEWORK: jax
           MAX_JOBS: 1
           SCCACHE_GHA_ENABLED: "true"
+          NVTE_CUDA_ARCHS: "100"
       - name: 'Sanity check'
         run: python3 tests/jax/test_sanity_import.py
   all:
     name: 'All'
     runs-on: ubuntu-latest
+    container:
+      image: nvcr.io/nvidia/cuda:13.0.0-devel-ubuntu22.04
+      options: --user root
     steps:
-      - name: Move /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
-
-      - name: Maximize build space
-        uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
-        with:
-          root-reserve-mb: 5120
-          temp-reserve-mb: 32
-          swap-size-mb: 10240
-          remove-dotnet: 'true'
-          remove-android: 'true'
-          remove-haskell: 'true'
-          remove-codeql: 'true'
-          build-mount-path: '/var/lib/docker/'
-
-      - name: Restore /var/lib/docker/
-        shell: bash -euxo pipefail {0}
-        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
-
+      - name: 'Dependencies'
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip cudnn9-cuda-13
+          pip install cmake==3.21.0 'pybind11[global]' ninja pydantic 'importlib-metadata>=1.0' packaging numpy einops onnxscript
+          pip install torch --index-url https://download.pytorch.org/whl/cu130
+          pip install 'jax[cuda13]' 'flax[cuda13]'
+          git config --global --add safe.directory '*'
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
           submodules: recursive
-
-      - name: Start named container
-        run: |
-          docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity
-
-      - name: 'Dependencies'
-        run: |
-          docker exec builder bash -c '\
-            pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
-            pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
-          '
+      - name: ccache
+        uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
       - name: 'Build'
-        run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
+        run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps
         env:
           NVTE_FRAMEWORK: all
           MAX_JOBS: 1
+          SCCACHE_GHA_ENABLED: "true"
+          NVTE_CUDA_ARCHS: "100"
       - name: 'Sanity check'
-        run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
+        run: |
+          python3 tests/pytorch/test_sanity_import.py
+          python3 tests/jax/test_sanity_import.py
diff --git a/.github/workflows/deploy_nightly_docs.yml b/.github/workflows/deploy_nightly_docs.yml
index b4e015d2da..a8e5ee5ba2 100644
--- a/.github/workflows/deploy_nightly_docs.yml
+++ b/.github/workflows/deploy_nightly_docs.yml
@@ -7,6 +7,7 @@ name: Deploy nightly docs
 on:
   push:
     branches: [ "main" ]
+  workflow_dispatch:
 jobs:
   build:
     uses: ./.github/workflows/docs.yml
@@ -21,9 +22,8 @@ jobs:
           name: "te_docs"
           path: "html"
       - name: Prepare for pages
-        uses: actions/upload-pages-artifact@v1.0.7
+        uses: actions/upload-pages-artifact@v3
         with:
-          name: github-pages
           path: "html"
   deploy:
     needs: prepare
@@ -36,4 +36,5 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Deploy
-        uses: actions/deploy-pages@v2.0.0
+        id: deployment
+        uses: actions/deploy-pages@v4