GPU Test - schedule #356

Workflow file for this run

name: GPU Test
permissions:
contents: read
on:
schedule:
# Every day at 5 AM UTC+8
- cron: '0 21 * * *'
workflow_dispatch:
repository_dispatch:
types: [ci-gpu, ci-all]
run-name: >-
${{ github.event_name == 'repository_dispatch'
&& format(
'GPU Test - PR #{0} - {1} - {2}',
github.event.client_payload.pull_number,
github.event.client_payload.ci_label,
github.event.client_payload.correlation_id
)
|| format('GPU Test - {0}', github.event_name) }}
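# Note: '${{ cond && a || b }}' is the usual GitHub Actions ternary idiom, so the run name
# reads e.g. "GPU Test - PR #123 - ci-gpu - <correlation_id>" for repository_dispatch runs
# (illustrative values) and "GPU Test - schedule" / "GPU Test - workflow_dispatch" otherwise.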
jobs:
tests-full:
if: >
github.event_name != 'repository_dispatch' ||
github.event.action == 'ci-gpu' ||
github.event.action == 'ci-all'
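# Scheduled and manually dispatched runs always execute this job; repository_dispatch runs
# execute it only for the ci-gpu and ci-all event types listed above.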
name: Full Test (${{ matrix.mark.display-name }}, ${{ matrix.env.setup-script }}, Python ${{ matrix.env.python-version }})
runs-on: ${{ matrix.mark.runs-on }}
timeout-minutes: 30
strategy:
matrix:
mark:
- id: store
display-name: Store
pytest-mark: 'store' # store tests should not require gpu
runs-on: ubuntu-latest
has-gpu: false
# AgentOps gets its own matrix entry because it injects tricky global state.
- id: agentops
display-name: AgentOps
pytest-mark: 'agentops' # including agentops+litellm tests here
runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
has-gpu: true
# Similar for Weave.
- id: weave
display-name: Weave
pytest-mark: 'weave'
runs-on: ubuntu-latest # No GPU tests for Weave.
has-gpu: false
# Other tests that require GPU
- id: gpu
display-name: GPU required
pytest-mark: '(gpu or llmproxy) and not agentops'
runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
has-gpu: true
# Remaining tests not covered by the marks above
- id: others
display-name: Others
pytest-mark: 'not store and not agentops and not weave and not gpu and not llmproxy'
runs-on: ubuntu-latest
has-gpu: false
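# The five marks above are meant to partition the suite: store, agentops, and weave run in
# isolation, '(gpu or llmproxy) and not agentops' goes to the self-hosted GPU pool, and
# 'others' catches everything left, so each test should run exactly once per env combination.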
env:
- python-version: '3.10'
setup-script: 'legacy'
- python-version: '3.12'
setup-script: 'stable'
- python-version: '3.13'
setup-script: 'latest'
fail-fast: false
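# 5 marks x 3 dependency setups = 15 jobs; fail-fast is disabled so one failing combination
# does not cancel the others.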
steps:
- name: Check GPU status
if: matrix.mark.has-gpu
run: nvidia-smi
- uses: actions/checkout@v6
with:
ref: ${{ github.event_name == 'repository_dispatch' && github.event.client_payload.pr_ref || (github.event.pull_request.number && format('refs/pull/{0}/merge', github.event.pull_request.number)) || github.ref }}
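# One reading of the ref expression above: prefer the PR ref from the repository_dispatch
# payload, else the merge ref when a pull_request number is available, else the ref that
# triggered the run.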
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
python-version: ${{ matrix.env.python-version }}
- name: Upgrade dependencies (latest)
run: uv lock --upgrade
if: matrix.env.setup-script == 'latest'
- name: Sync dependencies (latest, gpu)
if: matrix.env.setup-script == 'latest' && matrix.mark.has-gpu
run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group torch-gpu-stable
# Don't install vLLM/PyTorch GPU builds on the CPU-only variants.
- name: Sync dependencies (latest, cpu)
if: matrix.env.setup-script == 'latest' && !matrix.mark.has-gpu
run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group core-stable
- name: Sync dependencies (stable, gpu)
if: matrix.env.setup-script == 'stable' && matrix.mark.has-gpu
run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group torch-gpu-${{ matrix.env.setup-script }}
- name: Sync dependencies (stable, cpu)
if: matrix.env.setup-script == 'stable' && !matrix.mark.has-gpu
run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group core-stable
# Don't install langchain in the legacy dependency set because it conflicts with torch.
- name: Sync dependencies (legacy, gpu)
if: matrix.env.setup-script == 'legacy' && matrix.mark.has-gpu
run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group torch-gpu-legacy
- name: Sync dependencies (legacy, cpu)
if: matrix.env.setup-script == 'legacy' && !matrix.mark.has-gpu
run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group core-legacy
- name: Freeze dependencies
run: |
set -ex
uv pip freeze | tee requirements-freeze.txt
echo "UV_LOCKED=1" >> $GITHUB_ENV
echo "UV_NO_SYNC=1" >> $GITHUB_ENV
- name: Upload dependencies artifact
uses: actions/upload-artifact@v6
with:
name: dependencies-tests-full-${{ matrix.mark.id }}-${{ matrix.env.python-version }}-${{ matrix.env.setup-script }}
path: requirements-freeze.txt
compression-level: 0
- uses: actions/setup-node@v6
with:
node-version: '22'
cache: 'npm'
cache-dependency-path: dashboard/package-lock.json
- name: Install JavaScript dependencies
run: cd dashboard && npm ci
- name: Build dashboard
run: cd dashboard && npm run build
- name: Setup Docker environments
run: ./scripts/mongodb_docker_run.sh
shell: bash
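# Presumably brings up the MongoDB container (single-node replica set rs0) that the tests
# reach via AGL_TEST_MONGO_URI below.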
- name: Launch LiteLLM Proxy
run: |
./scripts/litellm_run.sh
env:
AZURE_API_BASE: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_BASE }}
AZURE_API_KEY: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_KEY }}
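# The LiteLLM proxy is expected to listen on localhost:12306, which later steps use as
# OPENAI_BASE_URL with a dummy API key.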
# mongo, openai, and gpu are all enabled by default.
- name: Run tests
run: |
uv run pytest -v --durations=0 tests -m "${{ matrix.mark.pytest-mark }}${{ matrix.env.setup-script == 'legacy' && ' and not langchain' || '' }}"
env:
PYTEST_ADDOPTS: "--color=yes"
OPENAI_BASE_URL: http://localhost:12306/
OPENAI_API_KEY: dummy
AGL_TEST_MONGO_URI: mongodb://localhost:27017/?replicaSet=rs0
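# The -m expression is the matrix mark with " and not langchain" appended on legacy setups,
# since langchain is not installed there (see the legacy sync steps above).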
minimal-examples:
if: >
github.event_name != 'repository_dispatch' ||
github.event.action == 'ci-gpu' ||
github.event.action == 'ci-all'
name: Minimal Examples with Python ${{ matrix.python-version }} (${{ matrix.setup-script }})
runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
timeout-minutes: 30
strategy:
matrix:
include:
- python-version: '3.10'
setup-script: 'legacy'
- python-version: '3.12'
setup-script: 'stable'
- python-version: '3.13'
setup-script: 'latest'
fail-fast: false
steps:
- name: Check GPU status
run: nvidia-smi
- uses: actions/checkout@v6
with:
ref: ${{ github.event_name == 'repository_dispatch' && github.event.client_payload.pr_ref || (github.event.pull_request.number && format('refs/pull/{0}/merge', github.event.pull_request.number)) || github.ref }}
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
python-version: ${{ matrix.python-version }}
- name: Upgrade dependencies (latest)
run: uv lock --upgrade
if: matrix.setup-script == 'latest'
- name: Sync dependencies (latest)
run: uv sync --frozen --no-default-groups --extra apo --group dev --group agents --group langchain --group torch-gpu-stable
if: matrix.setup-script == 'latest'
- name: Sync dependencies (stable)
run: uv sync --frozen --no-default-groups --extra apo --extra mongo --group dev --group agents --group langchain --group torch-gpu-${{ matrix.setup-script }}
if: matrix.setup-script == 'stable'
# Don't install langchain in the legacy dependency set because it conflicts with torch.
- name: Sync dependencies (legacy)
run: uv sync --frozen --no-default-groups --extra apo --extra mongo --group dev --group agents --group torch-gpu-legacy
if: matrix.setup-script == 'legacy'
- name: Freeze dependencies
run: |
set -ex
uv pip freeze | tee requirements-freeze.txt
echo "UV_LOCKED=1" >> $GITHUB_ENV
echo "UV_NO_SYNC=1" >> $GITHUB_ENV
- name: Upload dependencies artifact
uses: actions/upload-artifact@v6
with:
name: dependencies-minimal-examples-${{ matrix.python-version }}-${{ matrix.setup-script }}
path: requirements-freeze.txt
compression-level: 0
- name: Launch LiteLLM Proxy
run: |
./scripts/litellm_run.sh
env:
AZURE_API_BASE: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_BASE }}
AZURE_API_KEY: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_KEY }}
- name: Write Traces via Otel Tracer
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python write_traces.py otel
sleep 5
- name: Write Traces via AgentOps Tracer
env:
OPENAI_BASE_URL: http://localhost:12306/
OPENAI_API_KEY: dummy
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python write_traces.py agentops
sleep 5
- name: Write Traces with Operations
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python write_traces.py operation
sleep 5
- name: Write Traces via Otel Tracer with Client
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
agl store --port 45993 --log-level DEBUG &
sleep 5
python write_traces.py otel --use-client
pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
while pgrep -f agl; do
echo "Waiting for agl to finish..."
sleep 5
done
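# Pattern used by the "... with Client" steps: start `agl store` in the background, give it a
# few seconds to come up, run the example against it, then pkill and poll pgrep until the
# process has actually exited before moving on.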
- name: Write Traces via AgentOps Tracer with Client
env:
OPENAI_BASE_URL: http://localhost:12306/
OPENAI_API_KEY: dummy
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
agl store --port 45993 --log-level DEBUG &
sleep 5
python write_traces.py agentops --use-client
pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
while pgrep -f agl; do
echo "Waiting for agl to finish..."
sleep 5
done
- name: vLLM Server
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python vllm_server.py Qwen/Qwen2.5-0.5B-Instruct
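# Presumably a smoke test that vLLM can load and serve Qwen/Qwen2.5-0.5B-Instruct on this GPU
# runner, with the small 0.5B model keeping the step inside the 30-minute job timeout.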
- name: LLM Proxy (OpenAI backend)
env:
OPENAI_API_BASE: http://localhost:12306/
OPENAI_API_KEY: dummy
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python llm_proxy.py openai gpt-4.1-mini &
LLM_PROXY_READY=0
for attempt in $(seq 1 30); do
if curl -sSf http://localhost:43886/health > /dev/null 2>&1; then
LLM_PROXY_READY=1
break
fi
sleep 2
done
if [[ "$LLM_PROXY_READY" != "1" ]]; then
echo "LLM proxy failed to become healthy" >&2
exit 1
fi
python llm_proxy.py test gpt-4.1-mini
pkill -f llm_proxy.py && echo "SIGTERM sent to llm_proxy.py" || echo "No llm_proxy.py process found"
while pgrep -f llm_proxy.py; do
echo "Waiting for llm_proxy.py to finish..."
sleep 5
done
- name: LLM Proxy (vLLM backend)
if: matrix.setup-script != 'legacy' # Skip on legacy deps, where return_token_ids is not supported
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python llm_proxy.py vllm Qwen/Qwen2.5-0.5B-Instruct &
LLM_PROXY_READY=0
for attempt in $(seq 1 30); do
if curl -sSf http://localhost:43886/health > /dev/null 2>&1; then
LLM_PROXY_READY=1
break
fi
sleep 2
done
if [[ "$LLM_PROXY_READY" != "1" ]]; then
echo "LLM proxy failed to become healthy" >&2
exit 1
fi
python llm_proxy.py test Qwen/Qwen2.5-0.5B-Instruct
pkill -f llm_proxy.py && echo "SIGTERM sent to llm_proxy.py" || echo "No llm_proxy.py process found"
while pgrep -f llm_proxy.py; do
echo "Waiting for llm_proxy.py to finish..."
sleep 5
done
- name: MultiMetrics backend example
run: |
set -euo pipefail
source .venv/bin/activate
cd examples/minimal
python write_metrics.py --duration 8 --prom-port 9105 --prom-host 0.0.0.0 2>&1 | tee metrics.log &
pid=$!
for attempt in $(seq 1 20); do
if curl -sSf http://localhost:9105/metrics | grep -q minimal_requests_total; then
echo "Metrics endpoint responding"
wait $pid
cat metrics.log
exit 0
fi
sleep 1
done
echo "Metrics endpoint did not respond"
exit 1
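# This last check polls the Prometheus endpoint on port 9105 for the minimal_requests_total
# counter for up to ~20 s; on success it waits for the writer to finish and prints its log,
# otherwise the step fails.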