name: GPU Test

permissions:
  contents: read

on:
  schedule:
    # Every day at 5 AM UTC+8 (21:00 UTC)
    - cron: '0 21 * * *'
  workflow_dispatch:
  repository_dispatch:
    types: [ci-gpu, ci-all]
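# A repository_dispatch run is triggered via the REST API. An illustrative
# invocation (field names match the client_payload read below; the values
# are made up):
#   gh api repos/OWNER/REPO/dispatches -f event_type=ci-gpu \
#     -f 'client_payload[pull_number]=123' \
#     -f 'client_payload[pr_ref]=refs/pull/123/merge' \
#     -f 'client_payload[ci_label]=ci-gpu' \
#     -f 'client_payload[correlation_id]=abc123'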
run-name: >-
  ${{ github.event_name == 'repository_dispatch'
  && format(
    'GPU Test - PR #{0} - {1} - {2}',
    github.event.client_payload.pull_number,
    github.event.client_payload.ci_label,
    github.event.client_payload.correlation_id
  )
  || format('GPU Test - {0}', github.event_name) }}
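# The run-name expression above resolves to e.g.
# "GPU Test - PR #123 - ci-gpu - abc123" for repository_dispatch runs, and to
# "GPU Test - schedule" or "GPU Test - workflow_dispatch" otherwise.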
jobs:
  tests-full:
    if: >
      github.event_name != 'repository_dispatch' ||
      github.event.action == 'ci-gpu' ||
      github.event.action == 'ci-all'
    name: Full Test (${{ matrix.mark.display-name }}, ${{ matrix.env.setup-script }}, Python ${{ matrix.env.python-version }})
    runs-on: ${{ matrix.mark.runs-on }}
    timeout-minutes: 30
    strategy:
      matrix:
        mark:
          - id: store
            display-name: Store
            pytest-mark: 'store'  # store tests should not require a GPU
            runs-on: ubuntu-latest
            has-gpu: false
          # AgentOps needs to be separated because it injects tricky global state.
          - id: agentops
            display-name: AgentOps
            pytest-mark: 'agentops'  # including agentops+litellm tests here
            runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
            has-gpu: true
          # Similar for Weave.
          - id: weave
            display-name: Weave
            pytest-mark: 'weave'
            runs-on: ubuntu-latest  # No GPU tests for Weave.
            has-gpu: false
          # Other tests that require a GPU.
          - id: gpu
            display-name: GPU required
            pytest-mark: '(gpu or llmproxy) and not agentops'
            runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
            has-gpu: true
          # Everything not covered above.
          - id: others
            display-name: Others
            pytest-mark: 'not store and not agentops and not weave and not gpu and not llmproxy'
            runs-on: ubuntu-latest
            has-gpu: false
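        # The five pytest-mark expressions above are intended to partition the
        # suite: store, agentops, and weave run in isolation; "(gpu or
        # llmproxy) and not agentops" picks up the remaining GPU-bound tests;
        # "others" matches everything left over.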
        env:
          - python-version: '3.10'
            setup-script: 'legacy'
          - python-version: '3.12'
            setup-script: 'stable'
          - python-version: '3.13'
            setup-script: 'latest'
      fail-fast: false
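    # The mark and env dimensions form a full cross product: 5 marks x 3
    # dependency setups = 15 jobs per run. fail-fast is disabled so a failing
    # combination does not cancel the others.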
    steps:
      - name: Check GPU status
        if: matrix.mark.has-gpu
        run: nvidia-smi
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event_name == 'repository_dispatch' && github.event.client_payload.pr_ref || (github.event.pull_request.number && format('refs/pull/{0}/merge', github.event.pull_request.number)) || github.ref }}
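          # Ref fallback chain: the PR ref carried in the dispatch payload,
          # else the synthetic merge ref for pull_request events, else the
          # ref that triggered the run (github.ref).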
      - uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          python-version: ${{ matrix.env.python-version }}
      - name: Upgrade dependencies (latest)
        if: matrix.env.setup-script == 'latest'
        run: uv lock --upgrade
      - name: Sync dependencies (latest, gpu)
        if: matrix.env.setup-script == 'latest' && matrix.mark.has-gpu
        run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group torch-gpu-stable
      # Don't install vLLM/PyTorch on the CPU counterparts.
      - name: Sync dependencies (latest, cpu)
        if: matrix.env.setup-script == 'latest' && !matrix.mark.has-gpu
        run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group core-stable
      - name: Sync dependencies (stable, gpu)
        if: matrix.env.setup-script == 'stable' && matrix.mark.has-gpu
        run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group torch-gpu-${{ matrix.env.setup-script }}
      - name: Sync dependencies (stable, cpu)
        if: matrix.env.setup-script == 'stable' && !matrix.mark.has-gpu
        run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group langchain --group core-stable
      # Don't install langchain for the legacy setup because it conflicts with torch.
      - name: Sync dependencies (legacy, gpu)
        if: matrix.env.setup-script == 'legacy' && matrix.mark.has-gpu
        run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group torch-gpu-legacy
      - name: Sync dependencies (legacy, cpu)
        if: matrix.env.setup-script == 'legacy' && !matrix.mark.has-gpu
        run: uv sync --frozen --no-default-groups --extra apo --extra weave --extra mongo --group dev --group agents --group core-legacy
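      # The six sync steps above cover the setup-script x has-gpu grid:
      # "latest" re-locks first and then syncs, GPU variants pull a
      # torch-gpu-* group, CPU variants pull core-* instead, and legacy
      # additionally drops langchain.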
      - name: Freeze dependencies
        run: |
          set -ex
          uv pip freeze | tee requirements-freeze.txt
          echo "UV_LOCKED=1" >> $GITHUB_ENV
          echo "UV_NO_SYNC=1" >> $GITHUB_ENV
      - name: Upload dependencies artifact
        uses: actions/upload-artifact@v6
        with:
          name: dependencies-tests-full-${{ matrix.mark.id }}-${{ matrix.env.python-version }}-${{ matrix.env.setup-script }}
          path: requirements-freeze.txt
          compression-level: 0
      - uses: actions/setup-node@v6
        with:
          node-version: '22'
          cache: 'npm'
          cache-dependency-path: dashboard/package-lock.json
      - name: Install JavaScript dependencies
        run: cd dashboard && npm ci
      - name: Build dashboard
        run: cd dashboard && npm run build
      - name: Setup Docker environments
        shell: bash
        run: ./scripts/mongodb_docker_run.sh
      - name: Launch LiteLLM Proxy
        run: ./scripts/litellm_run.sh
        env:
          AZURE_API_BASE: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_BASE }}
          AZURE_API_KEY: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_KEY }}
      # mongo, openai, gpu: all enabled by default.
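      # The -m expression below appends " and not langchain" on legacy setups,
      # where langchain is never installed; e.g. the store/legacy job runs
      # pytest -m "store and not langchain".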
      - name: Run tests
        run: uv run pytest -v --durations=0 tests -m "${{ matrix.mark.pytest-mark }}${{ matrix.env.setup-script == 'legacy' && ' and not langchain' || '' }}"
        env:
          PYTEST_ADDOPTS: "--color=yes"
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
          AGL_TEST_MONGO_URI: mongodb://localhost:27017/?replicaSet=rs0

  minimal-examples:
    if: >
      github.event_name != 'repository_dispatch' ||
      github.event.action == 'ci-gpu' ||
      github.event.action == 'ci-all'
    name: Minimal Examples with Python ${{ matrix.python-version }} (${{ matrix.setup-script }})
    runs-on: [self-hosted, 1ES.Pool=agl-runner-gpu]
    timeout-minutes: 30
    strategy:
      matrix:
        include:
          - python-version: '3.10'
            setup-script: 'legacy'
          - python-version: '3.12'
            setup-script: 'stable'
          - python-version: '3.13'
            setup-script: 'latest'
      fail-fast: false
    steps:
      - name: Check GPU status
        run: nvidia-smi
      - uses: actions/checkout@v6
        with:
          ref: ${{ github.event_name == 'repository_dispatch' && github.event.client_payload.pr_ref || (github.event.pull_request.number && format('refs/pull/{0}/merge', github.event.pull_request.number)) || github.ref }}
      - uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          python-version: ${{ matrix.python-version }}
      - name: Upgrade dependencies (latest)
        if: matrix.setup-script == 'latest'
        run: uv lock --upgrade
      - name: Sync dependencies (latest)
        if: matrix.setup-script == 'latest'
        run: uv sync --frozen --no-default-groups --extra apo --group dev --group agents --group langchain --group torch-gpu-stable
      - name: Sync dependencies (stable)
        if: matrix.setup-script == 'stable'
        run: uv sync --frozen --no-default-groups --extra apo --extra mongo --group dev --group agents --group langchain --group torch-gpu-${{ matrix.setup-script }}
      # Don't install langchain for the legacy setup because it conflicts with torch.
      - name: Sync dependencies (legacy)
        if: matrix.setup-script == 'legacy'
        run: uv sync --frozen --no-default-groups --extra apo --extra mongo --group dev --group agents --group torch-gpu-legacy
      - name: Freeze dependencies
        run: |
          set -ex
          uv pip freeze | tee requirements-freeze.txt
          echo "UV_LOCKED=1" >> $GITHUB_ENV
          echo "UV_NO_SYNC=1" >> $GITHUB_ENV
      - name: Upload dependencies artifact
        uses: actions/upload-artifact@v6
        with:
          name: dependencies-minimal-examples-${{ matrix.python-version }}-${{ matrix.setup-script }}
          path: requirements-freeze.txt
          compression-level: 0
      - name: Launch LiteLLM Proxy
        run: ./scripts/litellm_run.sh
        env:
          AZURE_API_BASE: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_BASE }}
          AZURE_API_KEY: ${{ secrets.AZURE_GROUP_SUBSCRIPTION_API_KEY }}
      - name: Write Traces via Otel Tracer
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python write_traces.py otel
          sleep 5
      - name: Write Traces via AgentOps Tracer
        env:
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python write_traces.py agentops
          sleep 5
      - name: Write Traces with Operations
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python write_traces.py operation
          sleep 5
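      # The two client-mode steps below additionally start `agl store` in the
      # background, then shut it down with `pkill -f agl` (SIGTERM) and poll
      # `pgrep -f agl` until the process has exited cleanly.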
      - name: Write Traces via Otel Tracer with Client
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          agl store --port 45993 --log-level DEBUG &
          sleep 5
          python write_traces.py otel --use-client
          pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
          while pgrep -f agl; do
            echo "Waiting for agl to finish..."
            sleep 5
          done
      - name: Write Traces via AgentOps Tracer with Client
        env:
          OPENAI_BASE_URL: http://localhost:12306/
          OPENAI_API_KEY: dummy
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          agl store --port 45993 --log-level DEBUG &
          sleep 5
          python write_traces.py agentops --use-client
          pkill -f agl && echo "SIGTERM sent to agl" || echo "No agl process found"
          while pgrep -f agl; do
            echo "Waiting for agl to finish..."
            sleep 5
          done
      - name: vLLM Server
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python vllm_server.py Qwen/Qwen2.5-0.5B-Instruct
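      # The two proxy steps below share one pattern: launch llm_proxy.py in
      # the background, poll its /health endpoint (port 43886, presumably
      # fixed inside llm_proxy.py) for up to 30 x 2 s = 60 s, run the test
      # client, then shut the proxy down via the same pkill/pgrep idiom.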
      - name: LLM Proxy (OpenAI backend)
        env:
          OPENAI_API_BASE: http://localhost:12306/
          OPENAI_API_KEY: dummy
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python llm_proxy.py openai gpt-4.1-mini &
          LLM_PROXY_READY=0
          for attempt in $(seq 1 30); do
            if curl -sSf http://localhost:43886/health > /dev/null 2>&1; then
              LLM_PROXY_READY=1
              break
            fi
            sleep 2
          done
          if [[ "$LLM_PROXY_READY" != "1" ]]; then
            echo "LLM proxy failed to become healthy" >&2
            exit 1
          fi
          python llm_proxy.py test gpt-4.1-mini
          pkill -f llm_proxy.py && echo "SIGTERM sent to llm_proxy.py" || echo "No llm_proxy.py process found"
          while pgrep -f llm_proxy.py; do
            echo "Waiting for llm_proxy.py to finish..."
            sleep 5
          done
      - name: LLM Proxy (vLLM backend)
        if: matrix.setup-script != 'legacy'  # Skip if return_token_ids is not supported.
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python llm_proxy.py vllm Qwen/Qwen2.5-0.5B-Instruct &
          LLM_PROXY_READY=0
          for attempt in $(seq 1 30); do
            if curl -sSf http://localhost:43886/health > /dev/null 2>&1; then
              LLM_PROXY_READY=1
              break
            fi
            sleep 2
          done
          if [[ "$LLM_PROXY_READY" != "1" ]]; then
            echo "LLM proxy failed to become healthy" >&2
            exit 1
          fi
          python llm_proxy.py test Qwen/Qwen2.5-0.5B-Instruct
          pkill -f llm_proxy.py && echo "SIGTERM sent to llm_proxy.py" || echo "No llm_proxy.py process found"
          while pgrep -f llm_proxy.py; do
            echo "Waiting for llm_proxy.py to finish..."
            sleep 5
          done
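      # The final step backgrounds write_metrics.py and polls its Prometheus
      # endpoint for the minimal_requests_total counter for up to 20 s,
      # failing the job if the metric never appears.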
      - name: MultiMetrics backend example
        run: |
          set -euo pipefail
          source .venv/bin/activate
          cd examples/minimal
          python write_metrics.py --duration 8 --prom-port 9105 --prom-host 0.0.0.0 2>&1 | tee metrics.log &
          pid=$!
          for attempt in $(seq 1 20); do
            if curl -sSf http://localhost:9105/metrics | grep -q minimal_requests_total; then
              echo "Metrics endpoint responding"
              wait $pid
              cat metrics.log
              exit 0
            fi
            sleep 1
          done
          echo "Metrics endpoint did not respond"
          exit 1