# GPU Test
#
# Workflow file for GPU Test run #175.

name: GPU Test

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

on:
  schedule:
    # Every day at 3 AM UTC+8 (i.e. 19:00 UTC on the previous day).
    - cron: '0 19 * * *'
  workflow_dispatch:

jobs:
  # Runs the example agents end to end on a self-hosted GPU runner, once per
  # dependency setup in the matrix (`stable` and `latest`).
  examples:
    runs-on: [self-hosted, linux, gpu]
    timeout-minutes: 60
    strategy:
      matrix:
        setup: [stable, latest]
      # Let the other matrix entry keep running when one of them fails.
      fail-fast: false
    container:
      image: ghcr.io/microsoft/agent-lightning/base:latest
      options: --gpus all --ipc=host --interactive --tty
    steps:
      - name: Check GPU status
        run: nvidia-smi

      - uses: actions/checkout@v4

      - name: Create a virtual environment
        run: python3 -m venv .venv

      # Picks scripts/setup_stable_gpu.sh or scripts/setup_latest_gpu.sh
      # according to the matrix entry.
      - name: Install deps inside the container (${{ matrix.setup }})
        run: |
          . .venv/bin/activate
          ./scripts/setup_${{ matrix.setup }}_gpu.sh

      # Records which interpreter/tools are on PATH and the installed package
      # list; the list is uploaded as an artifact by the next step.
      - name: Freeze dependencies
        run: |
          . .venv/bin/activate
          which python
          which pip
          which uvx
          pip list | tee requirements-freeze.txt

      - name: Upload dependencies artifact
        uses: actions/upload-artifact@v4
        with:
          name: dependencies-${{ matrix.setup }}
          path: requirements-freeze.txt
          compression-level: 0

      - name: Prepare Spider dataset
        run: |
          set -ex
          . .venv/bin/activate
          cd examples/spider
          gdown --fuzzy https://drive.google.com/file/d/1oi9J1jZP9TyM35L85CL3qeGWl2jqlnL6/view
          unzip -q spider-data.zip -d data
          rm spider-data.zip

      - name: Prepare Calc-X dataset
        run: |
          set -ex
          . .venv/bin/activate
          cd examples/calc_x
          gdown --fuzzy https://drive.google.com/file/d/1FQMyKLLd6hP9dw9rfZn1EZOWNvKaDsqw/view
          unzip calc-x-data.zip -d data
          rm calc-x-data.zip

      - name: Spider sanity check
        run: |
          set -ex
          . .venv/bin/activate
          cd examples/spider
          python sql_agent.py --trainer.n-workers 1 --trainer.dev true --trainer.max-tasks 2
        env:
          VERL_API_BASE: http://localhost:9999/
          OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Calc-X MCP sanity check
        run: |
          set -ex
          . .venv/bin/activate
          cd examples/calc_x
          python tests/test_mcp_calculator.py
        env:
          OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Calc-X sanity check
        run: |
          set -ex
          . .venv/bin/activate
          cd examples/calc_x
          python calc_agent_dev.py
        env:
          OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      # Calc-X training suddenly works after running the sanity check.
      # And it has to be run before Spider training.
      # The client side used to hang in many of my attempts.
      # Don't ask why. Don't touch this.
      #
      # The agent is started in the background, train_ci.sh runs in the
      # foreground, then the agent is sent SIGTERM and polled until it exits.
      # NOTE(review): because of `set -e`, a failing train_ci.sh ends the step
      # before the pkill line, so this step does not stop the background agent
      # in that case -- confirm restart_ray.sh / cleanup.sh cover it.
      - name: Calc-X training
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/calc_x
          ../../scripts/restart_ray.sh
          sleep 5
          PYTHONUNBUFFERED=1 python calc_agent.py &
          bash train_ci.sh
          pkill -f calc_agent.py && echo "SIGTERM sent to calc_agent.py" || echo "No calc_agent.py process found"
          while pgrep -f calc_agent.py; do
            echo "Waiting for calc_agent.py to finish..."
            sleep 5
          done
          echo "calc_agent.py has finished."
          sleep 10
        shell: bash
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        id: calc_x_train

      # The step outputs are handed over through env vars and quoted in the
      # script instead of being expanded into the script text, so unusual
      # characters in them cannot be interpreted by the shell.
      # NOTE(review): presumably train_ci.sh writes `project_name` and
      # `run_name` to $GITHUB_OUTPUT -- not visible in this file; confirm.
      - name: Validate Calc-X training
        run: |
          set -ex
          . .venv/bin/activate
          python scripts/validate_example_wandb.py "$TRAIN_PROJECT_NAME" "$TRAIN_RUN_NAME"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          TRAIN_PROJECT_NAME: ${{ steps.calc_x_train.outputs.project_name }}
          TRAIN_RUN_NAME: ${{ steps.calc_x_train.outputs.run_name }}

      # Same background-agent / foreground-trainer sequence as Calc-X training.
      # `success() || failure()` makes this run even when an earlier step
      # failed (but not when the run was cancelled).
      - name: Spider training
        run: |
          set -ex
          source .venv/bin/activate
          cd examples/spider
          ../../scripts/restart_ray.sh
          sleep 5
          PYTHONUNBUFFERED=1 python sql_agent.py --trainer.n-workers 10 &
          bash train_ci.sh
          pkill -f sql_agent.py && echo "SIGTERM sent to sql_agent.py" || echo "No sql_agent.py process found"
          while pgrep -f sql_agent.py; do
            echo "Waiting for sql_agent.py to finish..."
            sleep 5
          done
          echo "sql_agent.py has finished."
          sleep 10
        shell: bash
        env:
          VERL_API_BASE: http://localhost:9991/
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        id: spider_train
        if: success() || failure()

      # Step outputs are passed via env vars and quoted (see Calc-X validation).
      # NOTE(review): this step has no `if:`, so it is skipped whenever any
      # earlier step failed, even if Spider training itself succeeded --
      # confirm that is intended.
      - name: Validate Spider training
        run: |
          set -ex
          . .venv/bin/activate
          python scripts/validate_example_wandb.py "$TRAIN_PROJECT_NAME" "$TRAIN_RUN_NAME"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          TRAIN_PROJECT_NAME: ${{ steps.spider_train.outputs.project_name }}
          TRAIN_RUN_NAME: ${{ steps.spider_train.outputs.run_name }}

      # Runs after both successful and failed jobs.
      - name: Cleanup
        run: ./scripts/cleanup.sh
        if: success() || failure()