348 changes: 348 additions & 0 deletions .github/workflows/rocm-ci.yml
@@ -0,0 +1,348 @@
# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.

name: TransformerEngine CI

on:
push:
branches:
- 'dev'
- 'release_v1.*_rocm'
- 'release_v2.*_rocm'
pull_request:
branches:
- 'dev'
- 'release_v1.**_rocm'
- 'release_v2.**_rocm'
workflow_dispatch:
inputs:
test_level:
description: 'Test Level (1-3)'
required: true
default: '3'
Review comment (Collaborator Author): Change back to 1 before merging

skip_dev_merge:
description: 'Skip merging dev branch'
type: boolean
default: false

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
build_and_test:
name: Build and Test on GPU
timeout-minutes: 720
runs-on: linux-mi325-8
steps:
Review comment (Collaborator Author): Need to add a step printing out the params, including repository variables
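      # A possible sketch of the step requested in the review comment above (hypothetical,
      # not part of this change): it would echo the workflow inputs and the repository
      # variables used later in this file.
      #
      # - name: Print parameters
      #   run: |
      #     echo "event:          ${{ github.event_name }}"
      #     echo "ref:            ${{ github.ref_name }}"
      #     echo "test_level:     ${{ inputs.test_level || '3' }}"
      #     echo "skip_dev_merge: ${{ inputs.skip_dev_merge }}"
      #     echo "DEV_DOCKER_IMAGE:    ${{ vars.DEV_DOCKER_IMAGE }}"
      #     echo "REL613_DOCKER_IMAGE: ${{ vars.REL613_DOCKER_IMAGE }}"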

- name: Checkout repository
uses: actions/checkout@v4
with:
submodules: 'recursive'
fetch-depth: 0

- name: Merge origin/dev
        # Only run on PRs targeting dev, or manual runs on dev where skip_dev_merge is not set
        if: |
          (github.event_name == 'pull_request' && github.base_ref == 'dev') ||
          (github.event_name == 'workflow_dispatch' && !inputs.skip_dev_merge && github.ref == 'refs/heads/dev')
run: |
echo "Attempting to merge origin/dev..."
git config --global user.email "[email protected]"
git config --global user.name "AMD CI"

# Fetch dev specifically
git fetch origin dev

# Attempt merge; this will exit with error code 1 if there is a conflict, failing the job
git merge origin/dev

# Update submodules after merge to ensure new files are present
echo "Updating submodules after merge..."
git submodule update --init --recursive

echo "Merge successful."

- name: Select Docker Image Tag
id: select-image
env:
DEV_IMAGE: ${{ vars.DEV_DOCKER_IMAGE }}
REL_IMAGE: ${{ vars.REL613_DOCKER_IMAGE }}
run: |
BRANCH_NAME="${{ github.base_ref || github.ref_name }}"
echo "Determining image for branch: $BRANCH_NAME"
DEV_DOCKER_IMAGE="$DEV_IMAGE"
REL613_DOCKER_IMAGE="$REL_IMAGE"
IMAGE_TO_USE="$DEV_DOCKER_IMAGE"
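          # release_v1.13_rocm and release_v1.14_rocm map to the REL613 image;
          # every other branch (dev and other releases) falls back to the dev image.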
if [[ $BRANCH_NAME =~ ^release_v([0-9]+)\.([0-9]+)_rocm$ ]]; then
MAJOR_VERSION=${BASH_REMATCH[1]}
MINOR_VERSION=${BASH_REMATCH[2]}
if (( MAJOR_VERSION == 1 )); then
if (( MINOR_VERSION == 13 || MINOR_VERSION == 14 )); then IMAGE_TO_USE="$REL613_DOCKER_IMAGE"; fi
fi
fi
echo "Selected image: $IMAGE_TO_USE"
echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT

- name: Pull Docker Image
run: |
docker pull ${{ steps.select-image.outputs.image-tag }}

- name: Run Container
run: |
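          # /dev/dri and /dev/kfd expose the ROCm GPUs to the container; the render and
          # video groups are added so the container user is allowed to open those devices.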
docker run -dt \
--name te-runner \
--network=host \
--device=/dev/dri --device=/dev/kfd \
--shm-size=16G \
--pid=host \
--group-add $(getent group render | cut -d: -f3) \
--group-add $(getent group video | cut -d: -f3) \
-v "${{ github.workspace }}:/workspace" \
-w /workspace \
            ${{ steps.select-image.outputs.image-tag }}

- name: Diagnostics
run: |
# On the runner
rocm-smi
# In the container
docker exec te-runner rocm-smi

- name: Determine GPU Architecture via rocminfo
id: gpu-arch
run: |
# Run rocminfo inside the container and capture the output
ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'")
if [ -z "$ARCH" ]; then
echo "::error::Could not determine GPU architecture using rocminfo inside the container."
# Optional: Print full rocminfo output for debugging
docker exec te-runner rocminfo
exit 1
fi
echo "Detected GPU Arch: $ARCH"
echo "arch=$ARCH" >> $GITHUB_OUTPUT

- name: Build Project
run: |
docker exec \
-e GPU_ARCH=${{ steps.gpu-arch.outputs.arch }} \
te-runner bash -c "$(cat <<'EOF'
set -ex

export HIP_PATH=""
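# Build only for the GPU architecture detected on this runner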
export PYTORCH_ROCM_ARCH=$GPU_ARCH
export NVTE_ROCM_ARCH=$GPU_ARCH
export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
pip install ninja
pip install --no-build-isolation -v . 2>&1
EOF
)"

- name: Run sGPU tests
id: sgpu-tests
continue-on-error: true
run: |
          # Clean up failure markers left over from previous runs, if any (a no-op on fresh k8s pods)
rm -f FAIL_*

docker exec \
-e TEST_SGPU=1 \
-e TEST_LEVEL=${{ inputs.test_level || '3' }} \
te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
set -x -o pipefail
ulimit -c 0 # Disable core dumps

# debug output
ls -d /opt/rocm*
python --version
pip list | egrep "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"

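# Run the three single-GPU suites concurrently, each pinned to its own GPU via HIP_VISIBLE_DEVICES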
HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 &
torch_pid=$!; echo "PyTorch test pid $torch_pid"

HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 &
jax_pid=$!; echo "JAX test pid $jax_pid"

HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 &
core_pid=$!; echo "Core test pid $core_pid"

wait $core_pid; core_rc=$?
wait $jax_pid; jax_rc=$?
wait $torch_pid; torch_rc=$?

# /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later
# Check PyTorch
if [ $torch_rc -ne 0 ]; then
echo "::group::[FAILED] PyTorch sGPU Log"
cat /workspace/torch_sgpu.log
echo "::endgroup::"
echo "::error::Pytorch sGPU test FAILED."
touch /workspace/FAIL_TORCH_SGPU
fi

# Check JAX
if [ $jax_rc -ne 0 ]; then
echo "::group::[FAILED] JAX sGPU Log"
cat /workspace/jax_sgpu.log
echo "::endgroup::"
echo "::error::JAX sGPU test FAILED."
touch /workspace/FAIL_JAX_SGPU
fi

# Check Core
if [ $core_rc -ne 0 ]; then
echo "::group::[FAILED] Core sGPU Log"
cat /workspace/core_sgpu.log
echo "::endgroup::"
echo "::error::Core sGPU test FAILED."
touch /workspace/FAIL_CORE_SGPU
fi

[ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ] && [ "$core_rc" -eq 0 ]
EOF
)"

# Export failed tests statuses to host runner
if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi
if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi
if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> $GITHUB_OUTPUT; fi

- name: Run mGPU tests
id: mgpu-tests
continue-on-error: true
run: |
docker exec \
-e TEST_MGPU=1 \
-e TEST_LEVEL=${{ inputs.test_level || '3' }} \
te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
set -x -o pipefail
ulimit -c 0 # Disable core dumps

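# Run the multi-GPU suites one at a time, with all GPUs visible to each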
# Run PyTorch
ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1
torch_rc=$?

# Run JAX
ci/jax.sh > /workspace/jax_mgpu.log 2>&1
jax_rc=$?

# /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later
if [ $torch_rc -ne 0 ]; then
echo "::group::[FAILED] PyTorch mGPU Log"
cat /workspace/torch_mgpu.log
echo "::endgroup::"
echo "::error::Pytorch mGPU test FAILED."
touch /workspace/FAIL_TORCH_MGPU
fi

if [ $jax_rc -ne 0 ]; then
echo "::group::[FAILED] JAX mGPU Log"
cat /workspace/jax_mgpu.log
echo "::endgroup::"
echo "::error::JAX mGPU test FAILED."
touch /workspace/FAIL_JAX_MGPU
fi

[ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ]
EOF
)"

# Export failed tests statuses to host runner
if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi
if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi

- name: Run Examples
id: examples-tests
continue-on-error: true
run: |
docker exec te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
set -ex -o pipefail
ulimit -c 0 # Disable core dumps

cd /workspace/examples/pytorch/mnist
python main.py 2>&1 | tee /workspace/examples.log
python main.py --use-te 2>&1 | tee -a /workspace/examples.log
python main.py --use-fp8 2>&1 | tee -a /workspace/examples.log

cd /workspace/examples/jax/mnist
pip3 install -r requirements.txt
python test_single_gpu_mnist.py 2>&1 | tee -a /workspace/examples.log
python test_single_gpu_mnist.py --use-te 2>&1 | tee -a /workspace/examples.log
python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a /workspace/examples.log

cd /workspace/examples/jax/encoder
pip3 install -r requirements.txt
python test_single_gpu_encoder.py 2>&1 | tee -a /workspace/examples.log
python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a /workspace/examples.log
EOF
)"

- name: Check Test Failure Status
if: always()
run: |
EXIT_STATUS=0

          # The sGPU/mGPU steps export per-suite failure markers as step outputs;
          # the examples step is checked via its outcome, which is 'failure' even
          # when continue-on-error was true.
          # sGPU CHECKS
if [[ "${{ steps.sgpu-tests.outputs.core }}" == "fail" ]]; then
echo "::error::Core sGPU Tests Failed."
EXIT_STATUS=1
fi
if [[ "${{ steps.sgpu-tests.outputs.torch }}" == "fail" ]]; then
echo "::error::PyTorch sGPU Tests Failed."
EXIT_STATUS=1
fi
if [[ "${{ steps.sgpu-tests.outputs.jax }}" == "fail" ]]; then
echo "::error::JAX sGPU Tests Failed."
EXIT_STATUS=1
fi

# mGPU CHECKS
if [[ "${{ steps.mgpu-tests.outputs.torch }}" == "fail" ]]; then
echo "::error::PyTorch mGPU Tests Failed."
EXIT_STATUS=1
fi
if [[ "${{ steps.mgpu-tests.outputs.jax }}" == "fail" ]]; then
echo "::error::JAX mGPU Tests Failed."
EXIT_STATUS=1
fi

# EXAMPLES CHECK
if [[ "${{ steps.examples-tests.outcome }}" == "failure" ]]; then
echo "::error::Example Tests Failed."
EXIT_STATUS=1
fi

# Fail the job if any errors were detected
if [[ "$EXIT_STATUS" == "1" ]]; then
exit 1
fi

- name: Copy logs and reports from container
if: always()
run: |
docker cp te-runner:/workspace/torch_sgpu.log ./torch_sgpu.log || true
docker cp te-runner:/workspace/jax_sgpu.log ./jax_sgpu.log || true
docker cp te-runner:/workspace/core_sgpu.log ./core_sgpu.log || true
docker cp te-runner:/workspace/torch_mgpu.log ./torch_mgpu.log || true
docker cp te-runner:/workspace/jax_mgpu.log ./jax_mgpu.log || true

- name: Upload logs and test reports
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-and-reports
path: |
*.log
if-no-files-found: ignore
retention-days: 5

- name: Cleanup container
if: always()
run: docker rm -f te-runner || true