49 changes: 49 additions & 0 deletions .github/actions/build-pytorch-wheel/Dockerfile
@@ -0,0 +1,49 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

FROM ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive

ENV CUDA_HOME=/usr/local/cuda
ENV PATH=$PATH:$CUDA_HOME/bin
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;9.0"

ARG PYTHON_VERSION=3.12
ARG TORCH_VERSION=2.9.1
ARG CUDA_VERSION=12.9.1
ARG CUDNN_MAJOR_VERSION=9
ENV PATH=/opt/venv/bin:$PATH
ENV PYTHONUNBUFFERED=1
ARG AARCH=x86_64

# Install Python
RUN apt-get update && \
    apt-get install -y software-properties-common wget && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y python$PYTHON_VERSION-dev python$PYTHON_VERSION-venv python3-pip && \
    python$PYTHON_VERSION -m venv /opt/venv


# Install cuda-toolkit
RUN CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1'}) && \
    CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $2'}) && \
    rm /etc/apt/sources.list.d/cuda*.list || true && \
    rm /etc/apt/sources.list.d/nvidia-cuda.list || true && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${AARCH}/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y cuda-toolkit-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} cudnn-cuda-$CUDA_MAJOR_VERSION libcudnn$CUDNN_MAJOR_VERSION-cuda-$CUDA_MAJOR_VERSION libnccl2 libnccl-dev cmake

# Install PyTorch
RUN export MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1 $2'}) && \
    export MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F \. {'print $1 "." $2'}) && \
    export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
        minv = {'2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126, '2.9': 126}[env['MATRIX_TORCH_VERSION']]; \
        maxv = {'2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129, '2.9': 130}[env['MATRIX_TORCH_VERSION']]; \
        print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
    ) && \
    pip install --no-cache-dir torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
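
As a quick illustration (not part of the Dockerfile), this is how the version-mapping RUN command above resolves for the default build args; the snippet below is a standalone bash sketch that mirrors the awk/python logic:

# Sketch: reproduce the CUDA wheel-tag selection for the default ARGs.
TORCH_VERSION=2.9.1
CUDA_VERSION=12.9.1
MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $1 $2}')        # -> 129
MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F. '{print $1 "." $2}')  # -> 2.9
# 129 >= 120, so the "maxv" table applies and torch 2.9 maps to 130,
# i.e. pip installs from https://download.pytorch.org/whl/cu130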
118 changes: 118 additions & 0 deletions .github/actions/build-pytorch-wheel/action.yml
@@ -0,0 +1,118 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

name: Build PyTorch Wheel
description: Builds a PyTorch wheel for TransformerEngine

inputs:
  release-version:
    description: 'The release version to use for the build'
    required: true
  python-version:
    description: 'The Python version to use for the build'
    required: true
  cuda-version:
    description: 'The CUDA version to use for the build'
    required: true
  cudnn-version:
    description: 'The cuDNN version to use for the build'
    required: true
  torch-version:
    description: 'The PyTorch version to use for the build'
    required: true
  cxx11_abi:
    description: 'Enable torch flag C++11 ABI (TRUE/FALSE)'
    required: true
  base-image:
    description: 'The base image to use for the build'
    required: false
  aarch:
    description: 'The architecture to use for the build'
    required: true
outputs:
  wheel_name:
    description: 'The name of the built wheel'
    value: ${{ steps.build_wheel.outputs.wheel_name }}

runs:
  using: 'composite'
  steps:
    - name: Move /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

    - name: Maximize build space
      uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
      with:
        root-reserve-mb: 5120
        temp-reserve-mb: 32
        swap-size-mb: 10240
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        build-mount-path: '/var/lib/docker/'

    - name: Restore /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

    - name: Checkout
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.release-version }}
        submodules: recursive

    - name: Checkout build tools
      uses: actions/checkout@v4
      with:
        path: build-tools
        submodules: recursive

    - name: Build image
      shell: bash -euxo pipefail {0}
      env:
        BASE_IMAGE: ${{ inputs.base-image }}
      run: |
        if [[ "${BASE_IMAGE}" == "" ]]; then
          docker build \
            -t transformer-engine-build \
            -f build-tools/.github/actions/build-pytorch-wheel/Dockerfile \
            --build-arg PYTHON_VERSION=${{ inputs.python-version }} \
            --build-arg TORCH_VERSION=${{ inputs.torch-version }} \
            --build-arg CUDA_VERSION=${{ inputs.cuda-version }} \
            --build-arg CUDNN_MAJOR_VERSION=${{ inputs.cudnn-version }} \
            --build-arg AARCH=${{ inputs.aarch }} \
            .
        else
          docker pull ${BASE_IMAGE}
          docker tag ${BASE_IMAGE} transformer-engine-build
        fi

    - name: Build wheel
      shell: bash -euxo pipefail {0}
      id: build_wheel
      env:
        CXX11_ABI: ${{ inputs.cxx11_abi }}
      run: |
        echo ::group::Build wheel

        # build.sh always prints its real exit status as the last line of stdout,
        # so capture that here instead of relying on the container's exit code.
        EXIT_CODE=$(docker run \
          --rm \
          --shm-size=64g \
          --workdir /workspace/transformer_engine/pytorch \
          --volume $(pwd):/workspace \
          --volume $GITHUB_OUTPUT:$GITHUB_OUTPUT \
          -e PIP_CONSTRAINT= \
          -e CXX11_ABI=$CXX11_ABI \
          -e GITHUB_OUTPUT=$GITHUB_OUTPUT \
          transformer-engine-build bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1)

        echo ::endgroup::

        # Do not fail the job if timeout killed the build
        exit $EXIT_CODE

    - name: Log Built Wheels
      shell: bash -euxo pipefail {0}
      run: |
        ls transformer_engine/pytorch/dist
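
For local debugging, a minimal sketch of what the "Build wheel" step runs, assuming the TransformerEngine repo is the current directory with build-tools checked out inside it and the transformer-engine-build image already built; /tmp/gh_output is a stand-in for $GITHUB_OUTPUT:

# Hypothetical local reproduction of the step above (illustration only).
GITHUB_OUTPUT=/tmp/gh_output
touch "$GITHUB_OUTPUT"
docker run --rm --shm-size=64g \
  --workdir /workspace/transformer_engine/pytorch \
  --volume "$(pwd)":/workspace \
  --volume "$GITHUB_OUTPUT":"$GITHUB_OUTPUT" \
  -e PIP_CONSTRAINT= -e CXX11_ABI=TRUE -e GITHUB_OUTPUT="$GITHUB_OUTPUT" \
  transformer-engine-build \
  bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1
cat "$GITHUB_OUTPUT"   # expected to contain wheel_name=<...> on success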
26 changes: 26 additions & 0 deletions .github/actions/build-pytorch-wheel/build.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

set -eoxu pipefail

export NVTE_PYTORCH_FORCE_BUILD=TRUE
export NVTE_NO_LOCAL_VERSION=1
export NVTE_PYTORCH_FORCE_CXX11_ABI=$CXX11_ABI
export PIP_CONSTRAINT=

pip install wheel packaging nvidia-mathdx ninja pybind11

# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?

if [ $EXIT_CODE -eq 0 ]; then
    wheel_name=$(python -c "import setup; print(setup.get_wheel_url()[1])" | tail -n 1)
    ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
    echo "wheel_name=${wheel_name}" | tee -a "$GITHUB_OUTPUT"
fi

echo $EXIT_CODE
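
A note on the `timeout 5h ... || EXIT_CODE=$?` pattern above: the `||` keeps `set -e` from aborting the script, and GNU timeout reports status 124 when it kills the command, so the final echo hands that status back to the caller. A standalone illustration (not part of the PR):

# Illustration: capture a timeout status without tripping `set -e`.
set -e
EXIT_CODE=0
timeout 2s sleep 5 || EXIT_CODE=$?
echo "$EXIT_CODE"   # prints 124, GNU timeout's status when the command is killed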
69 changes: 69 additions & 0 deletions .github/scripts/check_for_ngc_images.sh
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=5 # Check current month and previous 4 months (total 5)

# Initialize an array to store existing tags
EXISTING_TAGS=()

echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"

# Loop through the last N months
for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
    # Calculate Year and Month for the tag
    CURRENT_YEAR=$(date +%Y)
    CURRENT_MONTH=$(date +%m)

    # Calculate target month and year
    TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m)

    # Construct the full image tag and the tag-only string
    IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
    FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"

    echo "Checking: ${FULL_IMAGE}"

    # Use 'docker manifest inspect' to check for image existence without pulling.
    if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
        echo "✅ EXISTS: Found."
        # Add the full image reference to the array
        EXISTING_TAGS+=("${FULL_IMAGE}")
    else
        echo "❌ MISSING: Not found."
    fi
done

echo "---------------------------------------------------------------------"

## JSON Output Generation
# This uses the collected array to build a JSON string.

# 1. Convert the shell array to a newline-separated string.
TAGS_NL_SEP=$(printf "%s\n" "${EXISTING_TAGS[@]}")

# 2. Use jq to read the newline-separated list and format it into a JSON array.
# . | split("\n") | .[:-1] reads the input, splits it by newline, and removes the trailing empty element.
if command -v jq &> /dev/null; then
    JSON_STRING=$(echo -e "${TAGS_NL_SEP}" | jq -R -s 'split("\n") | .[:-1]')

    echo "Generated JSON String of Existing Tags:"
    echo "${JSON_STRING}"

    # Optional: Save the JSON string to a variable for further use
    # echo "JSON_STRING is now available in the shell if you source this script."
else
    echo "WARNING: 'jq' is not installed. Cannot format output as JSON."
    echo "Found Tags: ${EXISTING_TAGS[*]}"
fi

echo "---"
echo "Check complete."

echo "${JSON_STRING}" > ngc_images.json