# feat: add GPU topology discovery and unit tests #956
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that installs Iris from the triggering commit and runs externally
# hosted distributed tests against it.
name: Iris External Validation Test

# Triggers: pushes and pull requests targeting main, plus manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

# One active run per workflow + branch. head_ref is only set for pull
# requests, so other events fall back to the full ref. Runs on main are
# never cancelled; a newer run on any other ref cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  # The repository variable DOCKER_IMAGE_NAME wins when set; otherwise the
  # literal default below is used.
  # NOTE(review): presumably consumed by the .github/scripts/container_*.sh
  # helpers - confirm.
  DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }}
jobs:
  # Both jobs follow the same recipe: check out the repo, make sure a
  # container runtime exists, build the Iris container, reserve two GPUs,
  # run a two-process torchrun test inside the container, and always release
  # the GPUs afterwards. They differ only in the test script they download.

  external-validation-test:
    name: External Validation Test
    runs-on: [linux-mi325-8gpu-ossci-rad]
    timeout-minutes: 180
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Installs Apptainer only when neither apptainer nor docker is on PATH.
      - name: Setup Apptainer (if not available)
        run: |
          if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
            echo "Neither Apptainer nor Docker found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Container runtime already available"
          fi

      - name: Build Iris container
        run: |
          bash .github/scripts/container_build.sh

      # Reserves 2 GPUs, matching --nproc_per_node=2 in the test step.
      # NOTE(review): presumably this script exports GPU_DEVICES for later
      # steps - confirm against acquire_gpus.sh.
      - name: Acquire GPUs
        run: |
          bash .github/scripts/acquire_gpus.sh 2

      - name: Run External Validation Test
        run: |
          set -e
          echo "::group::Running external validation test"
          # Pass --gpus so the container is limited to the devices reserved
          # in the previous step, as the gluon job below already does.
          bash .github/scripts/container_exec.sh --gpus "$GPU_DEVICES" "
            set -e
            cd /iris_workspace
            pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
            wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/0827d023eaf8e9755b17cbe8ab06f2ce258e746a/test_iris_distributed.py
            torchrun --nproc_per_node=2 test_iris_distributed.py
          "
          echo "::endgroup::"
          echo "✅ External validation test passed!"

      # always() ensures the release script runs even when an earlier step
      # failed or the run was cancelled.
      - name: Release GPUs
        if: always()
        run: |
          bash .github/scripts/release_gpus.sh

  external-gluon-validation-test:
    name: External Gluon Validation Test
    runs-on: [linux-mi325-8gpu-ossci-rad]
    # Same cap as the job above, so a hung torchrun cannot hold the runner
    # until the platform default timeout.
    timeout-minutes: 180
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Installs Apptainer only when neither apptainer nor docker is on PATH.
      - name: Setup Apptainer (if not available)
        run: |
          if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
            echo "Neither Apptainer nor Docker found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Container runtime already available"
          fi

      - name: Build Iris container
        run: |
          bash .github/scripts/container_build.sh

      # Reserves 2 GPUs, matching --nproc_per_node=2 in the test step.
      # NOTE(review): presumably this script exports GPU_DEVICES for later
      # steps - confirm against acquire_gpus.sh.
      - name: Acquire GPUs
        run: |
          bash .github/scripts/acquire_gpus.sh 2

      - name: Run External Gluon Validation Test
        run: |
          set -e
          echo "::group::Running external gluon validation test"
          bash .github/scripts/container_exec.sh --gpus "$GPU_DEVICES" "
            set -e
            cd /iris_workspace
            pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
            wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/c5544943e2832c75252160bd9084600bf01a6b06/test_iris_gluon_distributed.py
            torchrun --nproc_per_node=2 test_iris_gluon_distributed.py
          "
          echo "::endgroup::"
          echo "✅ External gluon validation test passed!"

      # always() ensures the release script runs even when an earlier step
      # failed or the run was cancelled.
      - name: Release GPUs
        if: always()
        run: |
          bash .github/scripts/release_gpus.sh