# Enroot Tests #13
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow identity and triggers.
name: Enroot Tests

on:
  # Every push to main runs all matrix entries.
  push:
    branches: [main]

  # Manual runs can select individual tests, override container images,
  # skip install/uninstall, and point at a custom testbed file.
  workflow_dispatch:
    inputs:
      # --- Test selection -------------------------------------------------
      run_single_node_test:
        type: boolean
        required: false
        default: true
        description: 'Run single-node PyTorch test'
      run_multi_node_test:
        type: boolean
        required: false
        default: true
        description: 'Run multi-node distributed PyTorch test'
      run_rccl_test:
        type: boolean
        required: false
        default: true
        description: 'Run multi-node RCCL test'

      # --- Image overrides (empty string = no override passed) ------------
      base_image_single_node:
        type: string
        required: false
        default: ''
        description: >-
          Docker image for single-node test (default: rocm/pytorch:latest
          from batch script)
      base_image_multi_node:
        type: string
        required: false
        default: ''
        description: >-
          Docker image for multi-node test (default:
          docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1
          from batch script)
      base_image_rccl:
        type: string
        required: false
        default: ''
        description: >-
          Docker image for RCCL test (default:
          docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56
          from batch script)

      # --- Install / uninstall switches -----------------------------------
      no_install:
        type: boolean
        required: false
        default: false
        description: 'Skip installation (--no-install)'
      no_uninstall:
        type: boolean
        required: false
        default: false
        description: 'Skip uninstallation (--no-uninstall)'

      # --- Testbed override -----------------------------------------------
      testbed_file:
        type: string
        required: false
        default: ''
        description: >-
          Path to testbed file (overrides secret-based testbed). If not
          provided, uses SINGLE_NODE_TESTBED_FILE or MULTI_NODE_TESTBED_FILE
          secrets (which should contain YAML content).
jobs:
  # Runs each enroot test as its own matrix entry, one at a time, on the
  # self-hosted `enroot-runners` pool.
  #
  # Gating: on `push` every entry runs; on `workflow_dispatch` an entry runs
  # only when its matching `run_*` input is true. The decision is computed once
  # (MATRIX_ENTRY_ENABLED) and every step checks it, because the `matrix`
  # context is not available in a job-level `if`.
  run-enroot-tests:
    runs-on: enroot-runners
    timeout-minutes: 120
    strategy:
      matrix:
        test_name:
          - test_single_node_pytorch
          - test_multi_node_distributed_pytorch
          - test_multi_node_rccl
      max-parallel: 1  # Run tests sequentially
    env:
      # 'true' / 'false' as a string. `>-` (folded, strip) is deliberate:
      # a literal block (`|`) leaves a trailing newline outside `${{ }}`,
      # which turns the value into a non-empty string that is always truthy.
      MATRIX_ENTRY_ENABLED: >-
        ${{
          github.event_name == 'push' ||
          (github.event_name == 'workflow_dispatch' && (
            (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) ||
            (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) ||
            (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true)
          ))
        }}
    steps:
      - name: Checkout repository
        if: env.MATRIX_ENTRY_ENABLED == 'true'
        uses: actions/checkout@v4

      - name: Set up Python
        if: env.MATRIX_ENTRY_ENABLED == 'true'
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is never parsed as a float.
          python-version: '3.8'

      - name: Install dependencies
        if: env.MATRIX_ENTRY_ENABLED == 'true'
        run: |
          python3 -m pip install --upgrade pip
          pip install -r tests/enroot/requirements.txt

      - name: Create testbed file from secret
        if: env.MATRIX_ENTRY_ENABLED == 'true'
        working-directory: tests/enroot
        env:
          SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }}
          MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }}
        run: |
          # Create testbed files from secrets (secrets contain YAML content)
          mkdir -p testbed

          # write_testbed CONTENT DEST SECRET_NAME
          # Writes CONTENT to DEST when non-empty; otherwise only warns so the
          # later validation step can report the missing file.
          write_testbed() {
            if [ -n "$1" ]; then
              printf '%s\n' "$1" > "$2"
              echo "Created $2 from secret"
            else
              echo "[WARNING] $3 secret is not set"
            fi
          }

          write_testbed "$SINGLE_NODE_TESTBED" testbed/single_node_tb.yml SINGLE_NODE_TESTBED_FILE
          write_testbed "$MULTI_NODE_TESTBED" testbed/multi_node_tb.yml MULTI_NODE_TESTBED_FILE

          # List created testbed files for debugging
          echo "Testbed files created:"
          ls -la testbed/ || echo "No testbed directory"

      - name: Run enroot tests
        if: env.MATRIX_ENTRY_ENABLED == 'true'
        working-directory: tests/enroot
        env:
          # User-controlled values are handed to the script as environment
          # variables instead of being substituted into the script text, so a
          # crafted input cannot inject shell code. All DISPATCH_* values are
          # empty strings on `push` events.
          MATRIX_TEST_NAME: ${{ matrix.test_name }}
          DISPATCH_TESTBED_FILE: ${{ inputs.testbed_file }}
          DISPATCH_NO_INSTALL: ${{ inputs.no_install }}
          DISPATCH_NO_UNINSTALL: ${{ inputs.no_uninstall }}
          DISPATCH_IMAGE_SINGLE_NODE: ${{ inputs.base_image_single_node }}
          DISPATCH_IMAGE_MULTI_NODE: ${{ inputs.base_image_multi_node }}
          DISPATCH_IMAGE_RCCL: ${{ inputs.base_image_rccl }}
        run: |
          # Use matrix test_name for the test to run
          TEST_NAME="$MATRIX_TEST_NAME"

          # Testbed file: an explicit dispatch input wins; otherwise use the
          # per-test file generated from secrets (always the case on push).
          if [ -n "$DISPATCH_TESTBED_FILE" ]; then
            TESTBED_FILE="$DISPATCH_TESTBED_FILE"
          elif [ "$TEST_NAME" = "test_single_node_pytorch" ]; then
            TESTBED_FILE="testbed/single_node_tb.yml"
          else
            TESTBED_FILE="testbed/multi_node_tb.yml"
          fi

          # Install switches default to "false" when no dispatch input exists.
          NO_INSTALL="${DISPATCH_NO_INSTALL:-false}"
          NO_UNINSTALL="${DISPATCH_NO_UNINSTALL:-false}"

          # Image override for this test type; empty means "no override".
          case "$TEST_NAME" in
            test_single_node_pytorch)            DOCKER_IMAGE="$DISPATCH_IMAGE_SINGLE_NODE" ;;
            test_multi_node_distributed_pytorch) DOCKER_IMAGE="$DISPATCH_IMAGE_MULTI_NODE" ;;
            test_multi_node_rccl)                DOCKER_IMAGE="$DISPATCH_IMAGE_RCCL" ;;
            *)                                   DOCKER_IMAGE="" ;;
          esac

          # Validate testbed file exists
          if [ ! -f "$TESTBED_FILE" ]; then
            echo "[ERROR] Testbed file not found: $TESTBED_FILE"
            echo "Please ensure the appropriate secret is set:"
            echo " - SINGLE_NODE_TESTBED_FILE for single-node tests"
            echo " - MULTI_NODE_TESTBED_FILE for multi-node tests"
            echo "Or provide a custom testbed_file input via workflow_dispatch."
            exit 1
          fi
          echo "Using testbed file: $TESTBED_FILE"

          # Run RCCL test differently (pytest directly)
          if [ "$TEST_NAME" = "test_multi_node_rccl" ]; then
            # For RCCL test: extract version tag from docker image if provided
            if [ -n "$DOCKER_IMAGE" ]; then
              # Keep everything after the last ':' of the image reference.
              # Example: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56
              # Result: ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56
              DOCKER_IMAGE_VERSION="${DOCKER_IMAGE##*:}"
              export DOCKER_IMAGE_VERSION
              echo "Using RCCL Docker image version: $DOCKER_IMAGE_VERSION"
            fi

            # Convert testbed file to an absolute path before changing
            # directory; a path that is already absolute is used unchanged.
            case "$TESTBED_FILE" in
              /*) TESTBED_FILE_ABS="$TESTBED_FILE" ;;
              *)  TESTBED_FILE_ABS="$PWD/$TESTBED_FILE" ;;
            esac

            # Set PYTHONPATH and cd to testsuites directory for pytest.
            # Only append the previous value when it is non-empty, so no
            # dangling ':' entry is produced.
            export PYTHONPATH="$PWD${PYTHONPATH:+:$PYTHONPATH}"
            cd testsuites

            # NOTE(review): --no-install/--no-uninstall are always passed here,
            # so the no_install/no_uninstall inputs have no effect on the RCCL
            # test. Kept as in the original; confirm this is intended.
            python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE_ABS" -k test_multi_node_rccl --no-install --no-uninstall
          else
            # For other tests: use run_test.py
            python3 run_test.py "$TEST_NAME" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE"
          fi

      - name: Upload test results
        # always(): still upload when the test step failed, but only for
        # matrix entries that were actually selected to run.
        if: always() && env.MATRIX_ENTRY_ENABLED == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: test-results-${{ matrix.test_name }}-${{ github.run_number }}
          path: tests/enroot/results/
          if-no-files-found: warn
          retention-days: 30