49 changes: 49 additions & 0 deletions .github/actions/build-pytorch-wheel/Dockerfile
@@ -0,0 +1,49 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

FROM ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive

ENV CUDA_HOME=/usr/local/cuda
ENV PATH=$PATH:$CUDA_HOME/bin
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;9.0"

ARG PYTHON_VERSION=3.12
ARG TORCH_VERSION=2.9.1
ARG CUDA_VERSION=12.9.1
ARG CUDNN_MAJOR_VERSION=9
ENV PATH=/opt/venv/bin:$PATH
ENV PYTHONUNBUFFERED=1
ARG AARCH=x86_64

# Install Python
RUN apt-get update && \
    apt-get install -y software-properties-common wget && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y python$PYTHON_VERSION-dev python$PYTHON_VERSION-venv python3-pip && \
    python$PYTHON_VERSION -m venv /opt/venv


# Install cuda-toolkit
RUN CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1'}) && \
    CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $2'}) && \
    rm /etc/apt/sources.list.d/cuda*.list || true && \
    rm /etc/apt/sources.list.d/nvidia-cuda.list || true && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${AARCH}/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y cuda-toolkit-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} cudnn-cuda-$CUDA_MAJOR_VERSION libcudnn$CUDNN_MAJOR_VERSION-cuda-$CUDA_MAJOR_VERSION libnccl2 libnccl-dev cmake

# Install PyTorch
RUN export MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1 $2'}) && \
    export MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F \. {'print $1 "." $2'}) && \
    export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
        minv = {'2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126, '2.9': 126}[env['MATRIX_TORCH_VERSION']]; \
        maxv = {'2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129, '2.9': 130}[env['MATRIX_TORCH_VERSION']]; \
        print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
    ) && \
    pip install --no-cache-dir torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
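
As a quick illustration (not part of the Dockerfile), this is how the version-mapping RUN command above resolves for the default build args; the snippet below is a standalone bash sketch that mirrors the awk/python logic:

# Sketch: reproduce the CUDA wheel-tag selection for the default ARGs.
TORCH_VERSION=2.9.1
CUDA_VERSION=12.9.1
MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $1 $2}')        # -> 129
MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F. '{print $1 "." $2}')  # -> 2.9
# 129 >= 120, so the "maxv" table applies and torch 2.9 maps to 130,
# i.e. pip installs from https://download.pytorch.org/whl/cu130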
118 changes: 118 additions & 0 deletions .github/actions/build-pytorch-wheel/action.yml
@@ -0,0 +1,118 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

name: Build PyTorch Wheel
description: Builds a PyTorch wheel for TransformerEngine

inputs:
  release-version:
    description: 'The release version to use for the build'
    required: true
  python-version:
    description: 'The Python version to use for the build'
    required: true
  cuda-version:
    description: 'The CUDA version to use for the build'
    required: true
  cudnn-version:
    description: 'The cuDNN version to use for the build'
    required: true
  torch-version:
    description: 'The PyTorch version to use for the build'
    required: true
  cxx11_abi:
    description: 'Enable torch flag C++11 ABI (TRUE/FALSE)'
    required: true
  base-image:
    description: 'The base image to use for the build'
    required: false
  aarch:
    description: 'The architecture to use for the build'
    required: true
outputs:
  wheel_name:
    description: 'The name of the built wheel'
    value: ${{ steps.build_wheel.outputs.wheel_name }}

runs:
  using: 'composite'
  steps:
    - name: Move /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

    - name: Maximize build space
      uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
      with:
        root-reserve-mb: 5120
        temp-reserve-mb: 32
        swap-size-mb: 10240
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        build-mount-path: '/var/lib/docker/'

    - name: Restore /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

    - name: Checkout
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.release-version }}
        submodules: recursive

    - name: Checkout build tools
      uses: actions/checkout@v4
      with:
        path: build-tools
        submodules: recursive

    - name: Build image
      shell: bash -euxo pipefail {0}
      env:
        BASE_IMAGE: ${{ inputs.base-image }}
      run: |
        if [[ "${BASE_IMAGE}" == "" ]]; then
          docker build \
            -t transformer-engine-build \
            -f build-tools/.github/actions/build-pytorch-wheel/Dockerfile \
            --build-arg PYTHON_VERSION=${{ inputs.python-version }} \
            --build-arg TORCH_VERSION=${{ inputs.torch-version }} \
            --build-arg CUDA_VERSION=${{ inputs.cuda-version }} \
            --build-arg CUDNN_MAJOR_VERSION=${{ inputs.cudnn-version }} \
            --build-arg AARCH=${{ inputs.aarch }} \
            .
        else
          docker pull ${BASE_IMAGE}
          docker tag ${BASE_IMAGE} transformer-engine-build
        fi

    - name: Build wheel
      shell: bash -euxo pipefail {0}
      id: build_wheel
      env:
        CXX11_ABI: ${{ inputs.cxx11_abi }}
      run: |
        echo ::group::Build wheel

        # build.sh always prints its real exit status as the last line of stdout,
        # so capture that here instead of relying on the container's exit code.
        EXIT_CODE=$(docker run \
          --rm \
          --shm-size=64g \
          --workdir /workspace/transformer_engine/pytorch \
          --volume $(pwd):/workspace \
          --volume $GITHUB_OUTPUT:$GITHUB_OUTPUT \
          -e PIP_CONSTRAINT= \
          -e CXX11_ABI=$CXX11_ABI \
          -e GITHUB_OUTPUT=$GITHUB_OUTPUT \
          transformer-engine-build bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1)

        echo ::endgroup::

        # Do not fail the job if timeout killed the build
        exit $EXIT_CODE

    - name: Log Built Wheels
      shell: bash -euxo pipefail {0}
      run: |
        ls transformer_engine/pytorch/dist
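
For local debugging, a minimal sketch of what the "Build wheel" step runs, assuming the TransformerEngine repo is the current directory with build-tools checked out inside it and the transformer-engine-build image already built; /tmp/gh_output is a stand-in for $GITHUB_OUTPUT:

# Hypothetical local reproduction of the step above (illustration only).
GITHUB_OUTPUT=/tmp/gh_output
touch "$GITHUB_OUTPUT"
docker run --rm --shm-size=64g \
  --workdir /workspace/transformer_engine/pytorch \
  --volume "$(pwd)":/workspace \
  --volume "$GITHUB_OUTPUT":"$GITHUB_OUTPUT" \
  -e PIP_CONSTRAINT= -e CXX11_ABI=TRUE -e GITHUB_OUTPUT="$GITHUB_OUTPUT" \
  transformer-engine-build \
  bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1
cat "$GITHUB_OUTPUT"   # expected to contain wheel_name=<...> on success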
26 changes: 26 additions & 0 deletions .github/actions/build-pytorch-wheel/build.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

set -eoxu pipefail

export NVTE_PYTORCH_FORCE_BUILD=TRUE
export NVTE_NO_LOCAL_VERSION=1
export NVTE_PYTORCH_FORCE_CXX11_ABI=$CXX11_ABI
export PIP_CONSTRAINT=

pip install wheel packaging nvidia-mathdx ninja pybind11

# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?

if [ $EXIT_CODE -eq 0 ]; then
    wheel_name=$(python -c "import setup; print(setup.get_wheel_url()[1])" | tail -n 1)
    ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
    echo "wheel_name=${wheel_name}" | tee -a "$GITHUB_OUTPUT"
fi

echo $EXIT_CODE
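
A note on the `timeout 5h ... || EXIT_CODE=$?` pattern above: the `||` keeps `set -e` from aborting the script, and GNU timeout reports status 124 when it kills the command, so the final echo hands that status back to the caller. A standalone illustration (not part of the PR):

# Illustration: capture a timeout status without tripping `set -e`.
set -e
EXIT_CODE=0
timeout 2s sleep 5 || EXIT_CODE=$?
echo "$EXIT_CODE"   # prints 124, GNU timeout's status when the command is killed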
69 changes: 69 additions & 0 deletions .github/scripts/check_for_ngc_images.sh
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=5 # Check current month and previous 4 months (total 5)

# Initialize an array to store existing tags
EXISTING_TAGS=()

echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"

# Loop through the last N months
for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
    # Calculate Year and Month for the tag
    CURRENT_YEAR=$(date +%Y)
    CURRENT_MONTH=$(date +%m)

    # Calculate target month and year
    TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m)

    # Construct the full image tag and the tag-only string
    IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
    FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"

    echo "Checking: ${FULL_IMAGE}"

    # Use 'docker manifest inspect' to check for image existence without pulling.
    if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
        echo "✅ EXISTS: Found."
        # Add the full image reference to the array
        EXISTING_TAGS+=("${FULL_IMAGE}")
    else
        echo "❌ MISSING: Not found."
    fi
done

echo "---------------------------------------------------------------------"

## JSON Output Generation
# This uses the collected array to build a JSON string.

# 1. Convert the shell array to a newline-separated string.
TAGS_NL_SEP=$(printf "%s\n" "${EXISTING_TAGS[@]}")

# 2. Use jq to read the newline-separated list and format it into a JSON array.
# . | split("\n") | .[:-1] reads the input, splits it by newline, and removes the trailing empty element.
if command -v jq &> /dev/null; then
    JSON_STRING=$(echo -e "${TAGS_NL_SEP}" | jq -R -s 'split("\n") | .[:-1]')

    echo "Generated JSON String of Existing Tags:"
    echo "${JSON_STRING}"

    # Optional: Save the JSON string to a variable for further use
    # echo "JSON_STRING is now available in the shell if you source this script."
else
    echo "WARNING: 'jq' is not installed. Cannot format output as JSON."
    echo "Found Tags: ${EXISTING_TAGS[*]}"
fi

echo "---"
echo "Check complete."

echo "${JSON_STRING}" > ngc_images.json