Skip to content

Add fused bias support for GMM and bias‑gradient/accumulate support f… #18

Add fused bias support for GMM and bias‑gradient/accumulate support f…

Add fused bias support for GMM and bias‑gradient/accumulate support f… #18

Workflow file for this run

# Workflow-level settings: display name, triggers, run concurrency, shared env.
name: Aiter Test

on:
  # Commits pushed to main.
  push:
    branches:
      - main
  # Triggers on PRs targeting `main`.
  pull_request:
    branches:
      - main
  # Manual runs started from the Actions UI or API.
  workflow_dispatch:

# Runs are grouped per workflow + ref. A newer run cancels an in-progress one
# in the same group, except when the ref is main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # Container image the test jobs pass to `docker run`.
  DOCKER_IMAGE: "rocm/pytorch:latest"
jobs:
# First job in the dependency chain: define-runners needs it, and the test
# jobs need define-runners.
check-signal:
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Invokes the repository's check_signal.sh with the workflow token and the
    # triggering commit SHA exported as environment variables.
    - name: Download and check signal artifact
      env:
        GITHUB_SHA: ${{ github.sha }}
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: ./.github/scripts/check_signal.sh
# Chooses the runner labels for the test jobs and publishes them as JSON-array
# job outputs, which the downstream jobs expand with fromJSON() into a matrix.
define-runners:
  runs-on: ubuntu-latest
  needs: [check-signal]
  outputs:
    standard_runners: ${{ steps.machines.outputs.standard_runners }}
    multigpu_runners: ${{ steps.machines.outputs.multigpu_runners }}
  steps:
    - name: Define whether runs on MI35X
      id: machines
      env:
        # Passed through the environment so the title text is never spliced
        # into the script source.
        PR_TITLE: ${{ github.event.pull_request.title }}
      run: |
        set -euo pipefail
        # NOTE(review): all three branches below currently emit the same
        # runner labels, while the log messages still mention MI325.
        # Confirm whether the branches are meant to differ.
        # GITHUB_REF is the runner-provided equivalent of github.ref; reading
        # it from the environment avoids interpolating it into the script.
        if [[ "${GITHUB_REF}" == "refs/heads/main" ]]; then
          echo "It's main branch, running tests on MI325 and MI35X..."
          echo 'standard_runners=["aiter-mi355-1gpu"]' >> "$GITHUB_OUTPUT"
          echo 'multigpu_runners=["aiter-mi355-8gpu"]' >> "$GITHUB_OUTPUT"
        elif grep -qi "mi35x" <<< "${PR_TITLE}"; then
          echo "PR title contains 'MI35X', running tests on MI325 and MI35X..."
          echo 'standard_runners=["aiter-mi355-1gpu"]' >> "$GITHUB_OUTPUT"
          echo 'multigpu_runners=["aiter-mi355-8gpu"]' >> "$GITHUB_OUTPUT"
        else
          echo "Not main branch and PR title does not contain mi35x, only running on MI325..."
          echo 'standard_runners=["aiter-mi355-1gpu"]' >> "$GITHUB_OUTPUT"
          echo 'multigpu_runners=["aiter-mi355-8gpu"]' >> "$GITHUB_OUTPUT"
        fi
        # Print what was written; echoing "$GITHUB_OUTPUT" would only show
        # the path of the output file.
        cat "$GITHUB_OUTPUT"

    - name: Show output variable
      env:
        # The values are JSON containing double quotes; going through env
        # vars keeps those quotes intact when echoed.
        STANDARD_RUNNERS: ${{ steps.machines.outputs.standard_runners }}
        MULTIGPU_RUNNERS: ${{ steps.machines.outputs.multigpu_runners }}
      run: |
        echo "Standard: ${STANDARD_RUNNERS}"
        echo "Multi-GPU: ${MULTIGPU_RUNNERS}"
# Single-GPU test job: one matrix entry per label published by define-runners.
# Builds and tests Aiter inside a container started from DOCKER_IMAGE.
standard:
  needs: define-runners
  strategy:
    fail-fast: false
    matrix:
      runner: ${{ fromJSON(needs.define-runners.outputs.standard_runners) }}
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Sync submodules
      run: |
        set -euo pipefail
        # Read the ref from the runner environment (quoted) instead of
        # interpolating the expression into the script text.
        if [[ "${GITHUB_REF}" == "refs/heads/main" ]]; then
          echo "It's main branch, syncing latest CK..."
          git submodule sync
          # --remote moves submodules to their tracked upstream branch tips.
          git submodule update --init --recursive --remote --depth 1 --jobs 4
        else
          echo "It's a PR branch, syncing specific CK..."
          git submodule sync
          # Without --remote, the commits pinned by the superproject are used.
          git submodule update --init --recursive --depth 1 --jobs 4
        fi

    - name: Clean up Rocm processes
      run: |
        ./.github/scripts/clean_up_rocm.sh

    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        # A container left over from an earlier run would make
        # `docker run --name aiter_test` fail with a name conflict.
        docker rm -f aiter_test >/dev/null 2>&1 || true
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        # $DEVICE_FLAG is intentionally unquoted: it holds whole
        # flag/value pairs that must be word-split into separate arguments.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add "$(getent group render | cut -d: -f3)" \
          --group-add "$(getent group video | cut -d: -f3)" \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          ${{ env.DOCKER_IMAGE }}

    - name: Setup pip config
      run: |
        docker exec -u root aiter_test bash -c "pip config set global.default-timeout 60"
        docker exec -u root aiter_test bash -c "pip config set global.retries 10"

    - name: Setup Aiter
      run: |
        set -ex
        echo "Setting up Aiter..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"

    - name: Tests
      run: |
        set -ex
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "MAX_JOBS=20 ./.github/scripts/aiter_test.sh"

    # The remaining steps use always() so logs are collected and the runner is
    # cleaned up even when an earlier step fails.
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: standard-test-log-${{ matrix.runner }}
        path: latest_test.log

    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true

    - name: Clean up Rocm processes
      if: always()
      run: |
        ./.github/scripts/clean_up_rocm.sh
# Multi-GPU test job: same container flow as the single-GPU job, but the test
# script is run with MULTIGPU=TRUE set in the container environment.
multi-gpu:
  needs: define-runners
  # only run multi-gpu tests on main branch due to limited multi-gpu resources
  if: github.ref == 'refs/heads/main'
  strategy:
    fail-fast: false
    matrix:
      runner: ${{ fromJSON(needs.define-runners.outputs.multigpu_runners) }}
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Sync submodules
      run: |
        set -euo pipefail
        # Read the ref from the runner environment (quoted) instead of
        # interpolating the expression into the script text.
        # The else branch cannot be reached while the job-level `if` above
        # restricts this job to main; it is kept so the step still behaves
        # correctly if that restriction is lifted.
        if [[ "${GITHUB_REF}" == "refs/heads/main" ]]; then
          echo "It's main branch, syncing latest CK..."
          git submodule sync
          # --remote moves submodules to their tracked upstream branch tips.
          git submodule update --init --recursive --remote --depth 1 --jobs 4
        else
          echo "It's a PR branch, syncing specific CK..."
          git submodule sync
          # Without --remote, the commits pinned by the superproject are used.
          git submodule update --init --recursive --depth 1 --jobs 4
        fi

    - name: Clean up Rocm processes
      run: ./.github/scripts/clean_up_rocm.sh

    - name: Run the container
      run: |
        set -ex
        echo "Starting container: aiter_test"
        # A container left over from an earlier run would make
        # `docker run --name aiter_test` fail with a name conflict.
        docker rm -f aiter_test >/dev/null 2>&1 || true
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        # $DEVICE_FLAG is intentionally unquoted: it holds whole
        # flag/value pairs that must be word-split into separate arguments.
        docker run -dt \
          --device=/dev/kfd $DEVICE_FLAG \
          --shm-size=16G \
          --network=host \
          --group-add "$(getent group render | cut -d: -f3)" \
          --group-add "$(getent group video | cut -d: -f3)" \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          --name aiter_test \
          ${{ env.DOCKER_IMAGE }}

    - name: Setup pip config
      run: |
        docker exec -u root aiter_test bash -c "pip config set global.default-timeout 60"
        docker exec -u root aiter_test bash -c "pip config set global.retries 10"

    - name: Setup Aiter
      run: |
        set -ex
        echo "Setting up Aiter..."
        docker exec \
          -w /workspace \
          aiter_test \
          bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh"

    - name: Tests
      run: |
        set -ex
        docker exec \
          -e MULTIGPU=TRUE \
          -w /workspace \
          aiter_test \
          bash -c "MAX_JOBS=20 ./.github/scripts/aiter_test.sh"

    # The remaining steps use always() so logs are collected and the runner is
    # cleaned up even when an earlier step fails.
    - name: Upload test logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: multigpu-test-${{ matrix.runner }}
        path: latest_test.log

    - name: Cleanup container
      if: always()
      run: |
        docker rm -f aiter_test || true

    - name: Clean up Rocm processes
      if: always()
      run: |
        ./.github/scripts/clean_up_rocm.sh