# Enroot Tests — GitHub Actions workflow file.
# (Captured from the workflow run page "Enroot Tests #21".)

# Enroot test workflow. Runs on every push to main, and on demand through
# workflow_dispatch, where each test can be toggled and its image overridden.
name: Enroot Tests

on:
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      # Per-test toggles. They are only consulted for manual runs; the job-level
      # `if` conditions run every test on push.
      run_single_node_pytorch:
        description: 'Run single-node PyTorch test'
        required: false
        type: boolean
        default: true
      run_multi_node_pytorch:
        description: 'Run multi-node distributed PyTorch test'
        required: false
        type: boolean
        default: true
      run_multi_node_rccl:
        description: 'Run multi-node RCCL test'
        required: false
        type: boolean
        default: true
      # Forwarded to run_test.py as its third and fourth positional arguments
      # (forced to "false" on push).
      no_install:
        description: 'Skip installation (--no-install)'
        required: false
        type: boolean
        default: false
      no_uninstall:
        description: 'Skip uninstallation (--no-uninstall)'
        required: false
        type: boolean
        default: false
      # Image overrides. The empty default is passed through unchanged as the
      # second argument of run_test.py; per the descriptions, the test tooling
      # then falls back to the image named there.
      docker_image_single_node:
        description: 'Docker image for single-node test (default: rocm/pytorch:latest)'
        required: false
        type: string
        default: ''
      docker_image_multi_node:
        description: 'Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)'
        required: false
        type: string
        default: ''
      docker_image_rccl:
        description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)'
        required: false
        type: string
        default: ''

# Least-privilege GITHUB_TOKEN: the visible steps only check out the repository
# and upload artifacts, neither of which needs write scopes.
# NOTE(review): widen this if run_test.py calls the GitHub API — not visible here.
permissions:
  contents: read
jobs:
  # Single-node PyTorch test
  test-single-node-pytorch:
    # Always runs on push; on manual dispatch only when its toggle is enabled.
    if: github.event_name == 'push' || inputs.run_single_node_pytorch == true
    runs-on: enroot-runners
    timeout-minutes: 120
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string rather than a float.
          python-version: '3.8'

      - name: Install dependencies
        # Both commands go through `python3 -m pip` so they target the same
        # interpreter that later runs run_test.py.
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install -r tests/enroot/requirements.txt

      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
        # Materialises the testbed description from the secret; fails the job
        # early when the secret is missing or empty.
        run: |
          mkdir -p testbed
          if [ -n "$SINGLE_NODE_TESTBED" ]; then
            printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
            echo "Created testbed/single_node_tb.yml from secret"
          else
            echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      - name: Run test_single_node_pytorch
        working-directory: tests/enroot
        # Dispatch inputs reach the script through environment variables and
        # are never spliced into the script text, so a crafted image string
        # cannot inject shell commands.
        env:
          DISPATCH_DOCKER_IMAGE: ${{ inputs.docker_image_single_node }}
          DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
          DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$DISPATCH_DOCKER_IMAGE"
          # Push events carry no inputs, so install/uninstall are never skipped.
          if [ "$GITHUB_EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$DISPATCH_NO_INSTALL"
            NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
          fi
          echo "Running test: test_single_node_pytorch"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
          python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml"

      - name: Upload test results
        # Results are uploaded even when the test step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-single-node-pytorch-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30
# Multi-node distributed PyTorch test
  test-multi-node-pytorch:
    needs: test-single-node-pytorch
    # Runs after the single-node job, provided that job succeeded or was
    # skipped (toggle off). `!cancelled()` replaces `always()`: it still lets
    # the job run after a skipped dependency, but a cancelled workflow no
    # longer starts this job.
    if: |
      !cancelled() &&
      (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) &&
      (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped')
    runs-on: enroot-runners
    timeout-minutes: 120
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string rather than a float.
          python-version: '3.8'

      - name: Install dependencies
        # Both commands go through `python3 -m pip` so they target the same
        # interpreter that later runs run_test.py.
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install -r tests/enroot/requirements.txt

      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        # Materialises the testbed description from the secret; fails the job
        # early when the secret is missing or empty.
        run: |
          mkdir -p testbed
          if [ -n "$MULTI_NODE_TESTBED" ]; then
            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
            echo "Created testbed/multi_node_tb.yml from secret"
          else
            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      - name: Run test_multi_node_distributed_pytorch
        working-directory: tests/enroot
        # Dispatch inputs reach the script through environment variables and
        # are never spliced into the script text, so a crafted image string
        # cannot inject shell commands.
        env:
          DISPATCH_DOCKER_IMAGE: ${{ inputs.docker_image_multi_node }}
          DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
          DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$DISPATCH_DOCKER_IMAGE"
          # Push events carry no inputs, so install/uninstall are never skipped.
          if [ "$GITHUB_EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$DISPATCH_NO_INSTALL"
            NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
          fi
          echo "Running test: test_multi_node_distributed_pytorch"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
          python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

      - name: Upload test results
        # Results are uploaded even when the test step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-multi-node-pytorch-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30
# Multi-node RCCL test
  test-multi-node-rccl:
    # Depends on BOTH earlier jobs. Checking only the multi-node PyTorch job is
    # not enough: when the single-node job fails, the PyTorch job is reported
    # as 'skipped', which would otherwise let this job run after a failure.
    needs:
      - test-single-node-pytorch
      - test-multi-node-pytorch
    # `!cancelled()` replaces `always()`: it still lets the job run after a
    # skipped dependency, but a cancelled workflow no longer starts this job.
    if: |
      !cancelled() &&
      (github.event_name == 'push' || inputs.run_multi_node_rccl == true) &&
      (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped') &&
      (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped')
    runs-on: enroot-runners
    timeout-minutes: 120
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string rather than a float.
          python-version: '3.8'

      - name: Install dependencies
        # Both commands go through `python3 -m pip` so they target the same
        # interpreter that later runs run_test.py.
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install -r tests/enroot/requirements.txt

      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        # Materialises the testbed description from the secret; fails the job
        # early when the secret is missing or empty.
        run: |
          mkdir -p testbed
          if [ -n "$MULTI_NODE_TESTBED" ]; then
            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
            echo "Created testbed/multi_node_tb.yml from secret"
          else
            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      - name: Run test_multi_node_rccl
        working-directory: tests/enroot
        # Dispatch inputs reach the script through environment variables and
        # are never spliced into the script text, so a crafted image string
        # cannot inject shell commands.
        env:
          DISPATCH_DOCKER_IMAGE: ${{ inputs.docker_image_rccl }}
          DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
          DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$DISPATCH_DOCKER_IMAGE"
          # Push events carry no inputs, so install/uninstall are never skipped.
          if [ "$GITHUB_EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$DISPATCH_NO_INSTALL"
            NO_UNINSTALL="$DISPATCH_NO_UNINSTALL"
          fi
          echo "Running test: test_multi_node_rccl"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
          python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

      - name: Upload test results
        # Results are uploaded even when the test step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-multi-node-rccl-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30