69 changes: 69 additions & 0 deletions .github/workflows/continuous_integration_test.yaml
@@ -0,0 +1,69 @@
name: Continuous Integration Tests

on:
  schedule:
    # Runs every hour
    - cron: '0 * * * *'
  # TODO: remove this when merged to main
  pull_request:
  workflow_dispatch:

concurrency:
  group: continuous-integration-${{ github.ref }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

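# Run each step in a login shell (-l) so the conda env activated by
# setup-miniconda stays active; -eo pipefail exits on the first error,
# including failures inside pipes.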
defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  integration_test:
    if: github.repository_owner == 'meta-pytorch'
    runs-on: linux.g5.12xlarge.nvidia.gpu
    timeout-minutes: 120
    strategy:
      matrix:
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v4
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install --upgrade pip
      - name: Install pinned torch nightly
        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
      - name: Download and install vLLM and its dependencies
        # TODO: this honestly could not be hackier if I tried
        run: |
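          # vllm_reqs.txt presumably pins vLLM's dependencies so the prebuilt
          # wheel below installs without resolving them from PyPI.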
          python -m pip install -r .github/packaging/vllm_reqs.txt
          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
      - name: Install Monarch
        run: python -m pip install torchmonarch==0.1.0rc1
      - name: Install torchtitan and torchstore
        run: |
          python -m pip install git+https://github.com/pytorch/torchtitan.git
          python -m pip install git+https://github.com/meta-pytorch/torchstore.git
      - name: Install dependencies
        run: python -m pip install --no-build-isolation -e ".[dev]"
      - name: Run vLLM policy correctness tests
        run: |
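          # Preload the conda env's libpython so native extensions that link
          # against it can resolve its symbols at runtime.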
          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
          export LD_LIBRARY_PATH=$CONDA/envs/test/lib
          pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv
        timeout-minutes: 20
      - name: Run GRPO e2e test
        run: |
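          # Same libpython preload workaround as the previous step.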
          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
          export LD_LIBRARY_PATH=$CONDA/envs/test/lib
          python tests/integration_tests/test_grpo_e2e.py
        timeout-minutes: 20
127 changes: 127 additions & 0 deletions tests/integration_tests/test_grpo_e2e.py
@@ -0,0 +1,127 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
End-to-end integration test for GRPO training.

This test validates that GRPO training can run without crashes or exceptions.
Similar to TorchTitan's integration test approach, we focus on functional
correctness (no crashes) rather than numerical validation.

Usage:
    python tests/integration_tests/test_grpo_e2e.py
"""

import subprocess
import sys
import time
from pathlib import Path


def run_grpo_training(
    config_path: str,
    max_steps: int = 5,
    timeout: int = 1800,
    extra_args: list[str] | None = None,
) -> subprocess.CompletedProcess:
    """
    Run GRPO training and verify it completes without crashes.

    Args:
        config_path: Path to the YAML config file
        max_steps: Number of training steps to run
        timeout: Maximum time in seconds to wait
        extra_args: Additional CLI arguments to pass

    Returns:
        CompletedProcess object with stdout/stderr

    Raises:
        RuntimeError: If training times out or exits with a non-zero return code
    """
    cmd = [
        sys.executable,
        "-m",
        "apps.grpo.main",
        "--config",
        config_path,
        f"trainer.training.steps={max_steps}",
        # Disable WandB for CI to avoid auth issues - only use console logging
        # (the leading "~" is a delete-style override that removes the wandb
        # section from the resolved config)
        "~metric_logging.wandb",
    ]

    if extra_args:
        cmd.extend(extra_args)

    print(f"Running GRPO e2e test: {config_path}")
    print(f"Command: {' '.join(cmd)}")
    print(f"Max steps: {max_steps}, Timeout: {timeout}s")
    print("-" * 80)

    start_time = time.time()

    try:
        result = subprocess.run(
            cmd,
            timeout=timeout,
            capture_output=True,
            text=True,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start_time
        raise RuntimeError(
            f"GRPO training timed out after {elapsed:.1f}s (timeout={timeout}s)"
        ) from None

    elapsed = time.time() - start_time

    # Print output for debugging
    if result.stdout:
        print("STDOUT:")
        print(result.stdout[-2000:])  # Print only the last 2000 chars to avoid overwhelming logs

    if result.stderr:
        print("\nSTDERR:")
        print(result.stderr[-2000:])

    print("-" * 80)

    # Check for success
    if result.returncode != 0:
        raise RuntimeError(
            f"GRPO training failed with return code {result.returncode} after {elapsed:.1f}s"
        )

    print(f"✓ GRPO training completed successfully in {elapsed:.1f}s")
    return result
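
# Example usage (sketch): extra CLI overrides can be passed via ``extra_args``.
# The override key below is hypothetical; real keys depend on the app's config
# schema.
#
#   run_grpo_training(
#       "apps/grpo/qwen3_1_7b.yaml",
#       max_steps=2,
#       extra_args=["trainer.training.seed=0"],
#   )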


def main():
    """Run GRPO e2e test."""
    print("=" * 80)
    print("GRPO E2E Integration Test")
    print("=" * 80)

    # Test GRPO with smallest model
    test_config = "apps/grpo/qwen3_1_7b.yaml"

    if not Path(test_config).exists():
        raise FileNotFoundError(f"Config file not found: {test_config}")

    try:
        run_grpo_training(test_config, max_steps=5, timeout=1800)
        print("\n" + "=" * 80)
        print("✓ GRPO e2e test passed!")
        print("=" * 80)
    except Exception as e:
        print("\n" + "=" * 80)
        print(f"✗ GRPO e2e test failed: {e}")
        print("=" * 80)
        sys.exit(1)


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions tests/integration_tests/test_vllm_policy_correctness.py
@@ -6,7 +6,7 @@

import pytest

-from forge.actors.policy import Policy
+from forge.actors.generator import Generator
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
@@ -51,7 +51,7 @@ async def test_same_output():
    vllm_model = AsyncLLM.from_engine_args(args)

    # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
        procs=1, num_replicas=1, with_gpus=True
    ).as_service(
        engine_args={
@@ -140,7 +140,7 @@ async def test_cache_usage():
    vllm_model = AsyncLLM.from_engine_args(args)

    # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
        procs=1, num_replicas=1, with_gpus=True
    ).as_service(
        engine_args={