Skip to content

GPU E2E Training Test #9

GPU E2E Training Test

GPU E2E Training Test #9

Workflow file for this run

# Manually triggered workflow: the only event is `workflow_dispatch`.
name: GPU E2E Training Test

on:
  workflow_dispatch:
    # Every input is declared as a string and has a default.
    inputs:
      # One or more instance types; extra entries serve as fallbacks.
      instance_type:
        description: 'Lambda instance type(s), comma-separated for fallback'
        type: string
        default: 'gpu_1x_a10,gpu_1x_a100,gpu_1x_h100_pcie'

      # One or more regions; extra entries serve as fallbacks.
      region:
        description: 'Lambda region(s), comma-separated for fallback'
        type: string
        default: 'us-south-1,us-west-1,us-east-1'

      # Epoch count; quoted so it stays a string.
      epochs:
        description: 'Number of training epochs'
        type: string
        default: '5'

      # Accepts 'false', 'true', or a number of minutes (per the description).
      debug:
        description: 'Debug mode (false/true/N minutes)'
        type: string
        default: 'false'
jobs:
  # Calls the reusable `runner.yml` workflow from Open-Athena/lambda-gha.
  # Its `id` output is used below as the `runs-on` label of `gpu-test`.
  # NOTE(review): presumably this provisions the Lambda GPU runner — confirm
  # against runner.yml. The reference is pinned to the mutable `main` branch
  # while receiving secrets; consider pinning to a commit SHA.
  lambda:
    uses: Open-Athena/lambda-gha/.github/workflows/runner.yml@main
    with:
      # `|| '<default>'` substitutes the default when the input is empty.
      instance_type: ${{ inputs.instance_type || 'gpu_1x_a10,gpu_1x_a100,gpu_1x_h100_pcie' }}
      region: ${{ inputs.region || 'us-south-1,us-west-1,us-east-1' }}
      debug: ${{ inputs.debug || 'false' }}
    secrets:
      GH_SA_TOKEN: ${{ secrets.GH_SA_TOKEN }}
      LAMBDA_API_KEY: ${{ secrets.LAMBDA_API_KEY }}
      LAMBDA_SSH_PRIVATE_KEY: ${{ secrets.LAMBDA_SSH_PRIVATE_KEY }}

  # Runs the end-to-end training test on the runner named by the `lambda` job.
  gpu-test:
    needs: lambda
    runs-on: ${{ needs.lambda.outputs.id }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Prints the GPU status before anything is installed.
      - name: Check GPU
        run: nvidia-smi

      - name: Install uv
        uses: astral-sh/setup-uv@v5

      - name: Install dependencies
        run: uv sync

      - name: Run e2e training test (GPU)
        env:
          # The input reaches the script through the environment instead of
          # being interpolated into the shell text.
          EPOCHS: ${{ inputs.epochs || '5' }}
        run: |
          # Run with GPU acceleration
          uv run python tests/e2e_train.py \
            --gpu \
            --epochs "$EPOCHS" \
            --verbose \
            --no-check # Don't check against CPU-expected loss (GPU may differ slightly)

      # Always uses 5 epochs, independent of the `epochs` input.
      # NOTE(review): presumably the CPU-expected loss is recorded for
      # 5 epochs — confirm against tests/e2e_train.py.
      - name: Run e2e training test (CPU baseline)
        run: |
          # Run CPU test to verify determinism
          uv run python tests/e2e_train.py --epochs 5

      # `always()` makes this step run even when an earlier step failed.
      - name: Show GPU memory usage
        if: always()
        run: nvidia-smi