Enroot Tests #21
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Enroot test workflow: runs automatically on pushes to main and can also be
# started manually, in which case the inputs below select suites and options.
name: Enroot Tests

on:
  push:
    branches:
      - main

  workflow_dispatch:
    inputs:
      # --- Suite toggles: each one gates the job of the same name. ---
      run_single_node_pytorch:
        description: "Run single-node PyTorch test"
        type: boolean
        required: false
        default: true
      run_multi_node_pytorch:
        description: "Run multi-node distributed PyTorch test"
        type: boolean
        required: false
        default: true
      run_multi_node_rccl:
        description: "Run multi-node RCCL test"
        type: boolean
        required: false
        default: true

      # --- Install / uninstall switches forwarded to run_test.py. ---
      no_install:
        description: "Skip installation (--no-install)"
        type: boolean
        required: false
        default: false
      no_uninstall:
        description: "Skip uninstallation (--no-uninstall)"
        type: boolean
        required: false
        default: false

      # --- Optional image overrides; an empty string is passed through
      # --- unchanged to run_test.py.
      docker_image_single_node:
        description: "Docker image for single-node test (default: rocm/pytorch:latest)"
        type: string
        required: false
        default: ""
      docker_image_multi_node:
        description: "Docker image for multi-node PyTorch test (default: docker://rocm/pytorch:rocm6.2.4_ubuntu22.04_py3.10_pytorch_release_2.3.0)"
        type: string
        required: false
        default: ""
      docker_image_rccl:
        description: "Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56)"
        type: string
        required: false
        default: ""

# The three jobs form a chain (single-node -> multi-node PyTorch -> RCCL).
# On push every job runs; on manual dispatch each job is gated by its
# run_* input. A downstream job runs only if its upstream job succeeded or
# was skipped.
#
# Security note: workflow inputs and context values are never expanded
# directly inside `run:` scripts. They are handed to the step through `env:`
# and read as quoted shell variables, so a value containing quotes, `$(...)`
# or backticks is treated as data instead of being executed by the shell.
jobs:
  # Single-node PyTorch test
  test-single-node-pytorch:
    if: github.event_name == 'push' || inputs.run_single_node_pytorch == true
    runs-on: enroot-runners
    timeout-minutes: 120
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is a string, not a float.
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip install -r tests/enroot/requirements.txt

      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
        run: |
          mkdir -p testbed
          if [ -n "$SINGLE_NODE_TESTBED" ]; then
            printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml
            echo "Created testbed/single_node_tb.yml from secret"
          else
            echo "[ERROR] SINGLE_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      - name: Run test_single_node_pytorch
        working-directory: tests/enroot
        env:
          # Untrusted / free-form values go through the environment, never
          # through ${{ }} expansion inside the script body.
          EVENT_NAME: ${{ github.event_name }}
          IMAGE_INPUT: ${{ inputs.docker_image_single_node }}
          NO_INSTALL_INPUT: ${{ inputs.no_install }}
          NO_UNINSTALL_INPUT: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$IMAGE_INPUT"
          # Push events carry no inputs, so install and uninstall both run.
          if [ "$EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$NO_INSTALL_INPUT"
            NO_UNINSTALL="$NO_UNINSTALL_INPUT"
          fi
          echo "Running test: test_single_node_pytorch"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
          python3 run_test.py "test_single_node_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/single_node_tb.yml"

      - name: Upload test results
        # Upload even when the test step failed so results can be inspected.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-single-node-pytorch-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30

  # Multi-node distributed PyTorch test
  test-multi-node-pytorch:
    needs: test-single-node-pytorch
    # always() keeps this job eligible when the upstream job was skipped;
    # the explicit result check still blocks it after an upstream failure.
    if: |
      always() &&
      (github.event_name == 'push' || inputs.run_multi_node_pytorch == true) &&
      (needs.test-single-node-pytorch.result == 'success' || needs.test-single-node-pytorch.result == 'skipped')
    runs-on: enroot-runners
    timeout-minutes: 120
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip install -r tests/enroot/requirements.txt

      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        run: |
          mkdir -p testbed
          if [ -n "$MULTI_NODE_TESTBED" ]; then
            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
            echo "Created testbed/multi_node_tb.yml from secret"
          else
            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      - name: Run test_multi_node_distributed_pytorch
        working-directory: tests/enroot
        env:
          # Inputs are passed as environment variables to avoid shell
          # injection through ${{ }} expansion in the script.
          EVENT_NAME: ${{ github.event_name }}
          IMAGE_INPUT: ${{ inputs.docker_image_multi_node }}
          NO_INSTALL_INPUT: ${{ inputs.no_install }}
          NO_UNINSTALL_INPUT: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$IMAGE_INPUT"
          # Push events carry no inputs, so install and uninstall both run.
          if [ "$EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$NO_INSTALL_INPUT"
            NO_UNINSTALL="$NO_UNINSTALL_INPUT"
          fi
          echo "Running test: test_multi_node_distributed_pytorch"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
          python3 run_test.py "test_multi_node_distributed_pytorch" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-multi-node-pytorch-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30

  # Multi-node RCCL test
  test-multi-node-rccl:
    needs: test-multi-node-pytorch
    # Same gating pattern as above: run after an upstream success or skip,
    # never after an upstream failure.
    if: |
      always() &&
      (github.event_name == 'push' || inputs.run_multi_node_rccl == true) &&
      (needs.test-multi-node-pytorch.result == 'success' || needs.test-multi-node-pytorch.result == 'skipped')
    runs-on: enroot-runners
    timeout-minutes: 120
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip install -r tests/enroot/requirements.txt

      - name: Create testbed files from secrets
        working-directory: tests/enroot
        env:
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        run: |
          mkdir -p testbed
          if [ -n "$MULTI_NODE_TESTBED" ]; then
            printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml
            echo "Created testbed/multi_node_tb.yml from secret"
          else
            echo "[ERROR] MULTI_NODE_TESTBED_FILE secret is not set"
            exit 1
          fi

      - name: Run test_multi_node_rccl
        working-directory: tests/enroot
        env:
          # Inputs are passed as environment variables to avoid shell
          # injection through ${{ }} expansion in the script.
          EVENT_NAME: ${{ github.event_name }}
          IMAGE_INPUT: ${{ inputs.docker_image_rccl }}
          NO_INSTALL_INPUT: ${{ inputs.no_install }}
          NO_UNINSTALL_INPUT: ${{ inputs.no_uninstall }}
        run: |
          DOCKER_IMAGE="$IMAGE_INPUT"
          # Push events carry no inputs, so install and uninstall both run.
          if [ "$EVENT_NAME" = "push" ]; then
            NO_INSTALL="false"
            NO_UNINSTALL="false"
          else
            NO_INSTALL="$NO_INSTALL_INPUT"
            NO_UNINSTALL="$NO_UNINSTALL_INPUT"
          fi
          echo "Running test: test_multi_node_rccl"
          echo "Docker image: ${DOCKER_IMAGE:-'(using default from batch script)'}"
          python3 run_test.py "test_multi_node_rccl" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "testbed/multi_node_tb.yml"

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results-multi-node-rccl-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30