# GPU E2E Training Test #9
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow display name.
name: GPU E2E Training Test
# Trigger: manual dispatch only. Every input is declared as a string; the
# jobs in this file re-apply the same defaults with `||` when reading them.
on:
  workflow_dispatch:
    inputs:
      # Read by the `lambda` job (`with.instance_type`).
      instance_type:
        description: 'Lambda instance type(s), comma-separated for fallback'
        default: 'gpu_1x_a10,gpu_1x_a100,gpu_1x_h100_pcie'
        type: string

      # Read by the `lambda` job (`with.region`).
      region:
        description: 'Lambda region(s), comma-separated for fallback'
        default: 'us-south-1,us-west-1,us-east-1'
        type: string

      # Read by the GPU training step of `gpu-test` (as the EPOCHS env var).
      epochs:
        description: 'Number of training epochs'
        default: '5'
        type: string

      # Read by the `lambda` job (`with.debug`).
      debug:
        description: 'Debug mode (false/true/N minutes)'
        default: 'false'
        type: string
jobs:
  # Calls the reusable runner workflow from Open-Athena/lambda-gha, handing it
  # the three secrets and the dispatch inputs. `gpu-test` runs on the runner
  # whose id this job exposes as `outputs.id`.
  lambda:
    # NOTE(review): `@main` is a mutable ref and this call passes secrets;
    # consider pinning to a commit SHA.
    uses: Open-Athena/lambda-gha/.github/workflows/runner.yml@main
    secrets:
      GH_SA_TOKEN: ${{ secrets.GH_SA_TOKEN }}
      LAMBDA_API_KEY: ${{ secrets.LAMBDA_API_KEY }}
      LAMBDA_SSH_PRIVATE_KEY: ${{ secrets.LAMBDA_SSH_PRIVATE_KEY }}
    with:
      # The `||` fallbacks repeat the defaults declared under `on:` so an
      # empty input still resolves to a value.
      instance_type: ${{ inputs.instance_type || 'gpu_1x_a10,gpu_1x_a100,gpu_1x_h100_pcie' }}
      region: ${{ inputs.region || 'us-south-1,us-west-1,us-east-1' }}
      debug: ${{ inputs.debug || 'false' }}

  # Runs the end-to-end training script on the runner provided by `lambda`:
  # first with `--gpu`, then a second time without it.
  gpu-test:
    needs: lambda
    runs-on: ${{ needs.lambda.outputs.id }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Check GPU
        run: nvidia-smi

      - name: Install uv
        uses: astral-sh/setup-uv@v5

      - name: Install dependencies
        run: uv sync

      # The epochs input reaches the script through the EPOCHS environment
      # variable, so its value never becomes part of the script text itself.
      - name: Run e2e training test (GPU)
        env:
          EPOCHS: ${{ inputs.epochs || '5' }}
        run: |
          # Run with GPU acceleration
          uv run python tests/e2e_train.py \
            --gpu \
            --epochs "$EPOCHS" \
            --verbose \
            --no-check # Don't check against CPU-expected loss (GPU may differ slightly)

      # This run uses a fixed `--epochs 5` (it does not follow the `epochs`
      # input) and omits `--no-check`.
      # NOTE(review): presumably the expected loss is tied to 5 epochs; confirm.
      - name: Run e2e training test (CPU baseline)
        run: |
          # Run CPU test to verify determinism
          uv run python tests/e2e_train.py --epochs 5

      # `always()` keeps this step running even when an earlier step failed.
      - name: Show GPU memory usage
        if: always()
        run: nvidia-smi