diff --git a/.github/workflows/continuous_integration_test.yaml b/.github/workflows/continuous_integration_test.yaml
new file mode 100644
index 000000000..e27d197f9
--- /dev/null
+++ b/.github/workflows/continuous_integration_test.yaml
@@ -0,0 +1,69 @@
+name: Continuous Integration Tests
+
+on:
+  schedule:
+    # Runs every hour
+    - cron: '0 * * * *'
+  # TODO: remove this when merged to main
+  pull_request:
+  workflow_dispatch:
+
+concurrency:
+  group: continuous-integration-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  integration_test:
+    if: github.repository_owner == 'meta-pytorch'
+    runs-on: linux.g5.12xlarge.nvidia.gpu
+    timeout-minutes: 120
+    strategy:
+      matrix:
+        python-version: ['3.10']
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+      - name: Setup conda env
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          miniconda-version: "latest"
+          activate-environment: test
+          python-version: ${{ matrix.python-version }}
+      - name: Update pip
+        run: python -m pip install --upgrade pip
+      - name: Install pinned torch nightly
+        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
+      - name: Download and install vLLM and its dependencies
+        # TODO: this honestly could not be hackier if I tried
+        run: |
+          python -m pip install -r .github/packaging/vllm_reqs.txt
+          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
+      - name: Install Monarch
+        run: pip install torchmonarch==0.1.0rc1
+      - name: Install torchtitan and torchstore
+        run: |
+          python -m pip install git+https://github.com/pytorch/torchtitan.git
+          python -m pip install git+https://github.com/meta-pytorch/torchstore.git
+      - name: Install dependencies
+        run: python -m pip install --no-build-isolation -e ".[dev]"
+      - name: Run vLLM policy correctness tests
+        run: |
+          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          export LD_LIBRARY_PATH=$CONDA/envs/test/lib
+          pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv
+        timeout-minutes: 20
+      - name: Run GRPO e2e test
+        run: |
+          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          export LD_LIBRARY_PATH=$CONDA/envs/test/lib
+          python tests/integration_tests/test_grpo_e2e.py
+        timeout-minutes: 20
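
Note on the two test steps above: exporting `LD_PRELOAD` for the env's `libpython` is a workaround, presumably so that prebuilt native extensions can resolve `libpython` at load time, since conda does not put it on the default loader path. (`LD_LIBRARY_PATH` was also corrected above to point at the `lib` directory rather than the `.so` file, since the loader treats it as a directory list.) Below is a minimal sketch of reproducing that setup outside CI, assuming the `test` conda env is already activated; the paths are derived from `sys.prefix` here instead of the runner's `$CONDA` variable, and nothing in it is part of the project's own API:

```python
# Local repro sketch for the workflow's libpython workaround (assumption:
# an activated conda env on Linux; not part of the project's own tooling).
import os
import subprocess
import sys
from pathlib import Path


def run_with_libpython_preload(test_path: str) -> int:
    """Run pytest with the same LD_PRELOAD/LD_LIBRARY_PATH the CI job sets."""
    prefix = Path(sys.prefix)  # the active env root, e.g. $CONDA/envs/test
    libpython = (
        prefix
        / "lib"
        / f"libpython{sys.version_info.major}.{sys.version_info.minor}.so.1.0"
    )
    env = os.environ.copy()
    if libpython.exists():  # only meaningful on builds that ship this .so
        env["LD_PRELOAD"] = str(libpython)
        env["LD_LIBRARY_PATH"] = str(prefix / "lib")
    return subprocess.run(
        ["pytest", test_path, "--durations=20", "-vv"], env=env
    ).returncode


if __name__ == "__main__":
    raise SystemExit(
        run_with_libpython_preload(
            "tests/integration_tests/test_vllm_policy_correctness.py"
        )
    )
```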
diff --git a/tests/integration_tests/test_grpo_e2e.py b/tests/integration_tests/test_grpo_e2e.py
new file mode 100644
index 000000000..f59b57eb5
--- /dev/null
+++ b/tests/integration_tests/test_grpo_e2e.py
@@ -0,0 +1,127 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+End-to-end integration test for GRPO training.
+
+This test validates that GRPO training can run without crashes or exceptions.
+Similar to TorchTitan's integration test approach, we focus on functional
+correctness (no crashes) rather than numerical validation.
+
+Usage:
+    python tests/integration_tests/test_grpo_e2e.py
+"""
+
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+def run_grpo_training(
+    config_path: str,
+    max_steps: int = 5,
+    timeout: int = 1800,
+    extra_args: list[str] | None = None,
+) -> subprocess.CompletedProcess:
+    """
+    Run GRPO training and verify it completes without crashes.
+
+    Args:
+        config_path: Path to YAML config file
+        max_steps: Number of training steps to run
+        timeout: Maximum time in seconds to wait
+        extra_args: Additional CLI arguments to pass
+
+    Returns:
+        CompletedProcess object with stdout/stderr
+
+    Raises:
+        Exception: If training fails with non-zero exit code
+    """
+    cmd = [
+        sys.executable,
+        "-m",
+        "apps.grpo.main",
+        "--config",
+        config_path,
+        f"trainer.training.steps={max_steps}",
+        # Disable WandB for CI to avoid auth issues - only use console logging
+        "~metric_logging.wandb",
+    ]
+
+    if extra_args:
+        cmd.extend(extra_args)
+
+    print(f"Running GRPO e2e test: {config_path}")
+    print(f"Command: {' '.join(cmd)}")
+    print(f"Max steps: {max_steps}, Timeout: {timeout}s")
+    print("-" * 80)
+
+    start_time = time.time()
+
+    try:
+        result = subprocess.run(
+            cmd,
+            timeout=timeout,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.TimeoutExpired:
+        elapsed = time.time() - start_time
+        raise Exception(
+            f"GRPO training timed out after {elapsed:.1f}s (timeout={timeout}s)"
+        ) from None
+
+    elapsed = time.time() - start_time
+
+    # Print output for debugging
+    if result.stdout:
+        print("STDOUT:")
+        print(result.stdout[-2000:])  # Print last 2000 chars to avoid overwhelming logs
+
+    if result.stderr:
+        print("\nSTDERR:")
+        print(result.stderr[-2000:])
+
+    print("-" * 80)
+
+    # Check for success
+    if result.returncode != 0:
+        raise Exception(
+            f"GRPO training failed with return code {result.returncode} after {elapsed:.1f}s"
+        )
+
+    print(f"✓ GRPO training completed successfully in {elapsed:.1f}s")
+    return result
+
+
+def main():
+    """Run GRPO e2e test."""
+    print("=" * 80)
+    print("GRPO E2E Integration Test")
+    print("=" * 80)
+
+    # Test GRPO with smallest model
+    test_config = "apps/grpo/qwen3_1_7b.yaml"
+
+    if not Path(test_config).exists():
+        raise FileNotFoundError(f"Config file not found: {test_config}")
+
+    try:
+        run_grpo_training(test_config, max_steps=5, timeout=1800)
+        print("\n" + "=" * 80)
+        print("✓ GRPO e2e test passed!")
+        print("=" * 80)
+    except Exception as e:
+        print("\n" + "=" * 80)
+        print(f"✗ GRPO e2e test failed: {e}")
+        print("=" * 80)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
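
Since `run_grpo_training` is a plain wrapper around `subprocess`, it can also be driven directly, e.g. for a quicker local smoke run. A hypothetical invocation from the repo root follows; the shorter step count and timeout are arbitrary choices, and the extra override key is illustrative only, not a documented config field:

```python
# Hypothetical local smoke run; assumes execution from the repo root so the
# relative config path resolves and the tests package is importable.
from tests.integration_tests.test_grpo_e2e import run_grpo_training

result = run_grpo_training(
    "apps/grpo/qwen3_1_7b.yaml",
    max_steps=2,  # shorter than the CI run's 5 steps
    timeout=900,  # fail fast locally instead of waiting the full 30 minutes
    extra_args=["trainer.training.seed=0"],  # illustrative key; use a real one from the YAML
)
print(f"exit code: {result.returncode}")  # run_grpo_training raises on failure
```

The `~metric_logging.wandb` override baked into the helper stays in effect, so local runs also skip WandB authentication and log to the console only.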
diff --git a/tests/integration_tests/test_vllm_policy_correctness.py b/tests/integration_tests/test_vllm_policy_correctness.py
index e2da9b068..0208e81aa 100644
--- a/tests/integration_tests/test_vllm_policy_correctness.py
+++ b/tests/integration_tests/test_vllm_policy_correctness.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from forge.actors.policy import Policy
+from forge.actors.generator import Generator
 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.sampling_params import RequestOutputKind
@@ -51,7 +51,7 @@ async def test_same_output():
     vllm_model = AsyncLLM.from_engine_args(args)
 
     # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
         procs=1, num_replicas=1, with_gpus=True
     ).as_service(
         engine_args={
@@ -140,7 +140,7 @@ async def test_cache_usage():
     vllm_model = AsyncLLM.from_engine_args(args)
 
     # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
         procs=1, num_replicas=1, with_gpus=True
     ).as_service(
         engine_args={
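
For context on the rename above: these tests build both a raw vLLM `AsyncLLM` engine and a `Generator` service from the same engine args, so only the `Policy` to `Generator` symbol changes. Below is a minimal sketch of the raw-engine half of that comparison, assuming vLLM's v1 `AsyncLLM` entry point; the model name, prompt, and engine flags are placeholders, not the test's actual settings:

```python
# Sketch of the raw-engine baseline only, under stated assumptions: vLLM's v1
# AsyncLLM and greedy sampling so two engines can be compared token-for-token.
import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM


async def baseline_generate(prompt: str) -> str:
    engine = AsyncLLM.from_engine_args(
        AsyncEngineArgs(model="Qwen/Qwen3-1.7B", enforce_eager=True)  # placeholder args
    )
    params = SamplingParams(
        temperature=0.0,  # greedy decoding, deterministic output
        max_tokens=32,
        output_kind=RequestOutputKind.FINAL_ONLY,  # yield only the finished output
    )
    final = None
    async for output in engine.generate(prompt, params, request_id="baseline-0"):
        final = output  # FINAL_ONLY means this loop sees a single item
    engine.shutdown()
    return final.outputs[0].text


if __name__ == "__main__":
    print(asyncio.run(baseline_generate("What is 2 + 2?")))
```

The service half of the test then routes the same prompts through `Generator.options(...).as_service(...)` and, as the name `test_same_output` suggests, checks that the two sides agree.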