69 changes: 69 additions & 0 deletions .github/workflows/continuous_integration_test.yaml
@@ -0,0 +1,69 @@
name: Continuous Integration Tests

on:
  schedule:
    # Runs every hour
    - cron: '0 * * * *'
  # TODO: remove this when merged to main
  pull_request:
  workflow_dispatch:

concurrency:
  group: continuous-integration-${{ github.ref }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

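# Run each step in a login shell (-l) so the conda env activated by
# setup-miniconda stays active; -eo pipefail exits on the first error,
# including failures inside pipes.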
defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  integration_test:
    if: github.repository_owner == 'meta-pytorch'
    runs-on: linux.g5.12xlarge.nvidia.gpu
    timeout-minutes: 120
    strategy:
      matrix:
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v4
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install --upgrade pip
      - name: Install pinned torch nightly
        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
      - name: Download and install vLLM and its dependencies
        # TODO: this honestly could not be hackier if I tried
        run: |
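          # vllm_reqs.txt presumably pins vLLM's dependencies so the prebuilt
          # wheel below installs without resolving them from PyPI.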
          python -m pip install -r .github/packaging/vllm_reqs.txt
          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
      - name: Install Monarch
        run: python -m pip install torchmonarch==0.1.0rc1
      - name: Install torchtitan and torchstore
        run: |
          python -m pip install git+https://github.com/pytorch/torchtitan.git
          python -m pip install git+https://github.com/meta-pytorch/torchstore.git
      - name: Install dependencies
        run: python -m pip install --no-build-isolation -e ".[dev]"
      - name: Run vLLM policy correctness tests
        run: |
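          # Preload the conda env's libpython so native extensions that link
          # against it can resolve its symbols at runtime.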
          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
          export LD_LIBRARY_PATH=$CONDA/envs/test/lib
          pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv
        timeout-minutes: 20
      - name: Run GRPO e2e test
        run: |
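          # Same libpython preload workaround as the previous step.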
          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
          export LD_LIBRARY_PATH=$CONDA/envs/test/lib
          python tests/integration_tests/test_grpo_e2e.py
        timeout-minutes: 20
127 changes: 127 additions & 0 deletions tests/integration_tests/test_grpo_e2e.py
@@ -0,0 +1,127 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
End-to-end integration test for GRPO training.

This test validates that GRPO training can run without crashes or exceptions.
Similar to TorchTitan's integration test approach, we focus on functional
correctness (no crashes) rather than numerical validation.

Usage:
    python tests/integration_tests/test_grpo_e2e.py
"""

import subprocess
import sys
import time
from pathlib import Path


def run_grpo_training(
    config_path: str,
    max_steps: int = 5,
    timeout: int = 1800,
    extra_args: list[str] | None = None,
) -> subprocess.CompletedProcess:
    """
    Run GRPO training and verify it completes without crashes.

    Args:
        config_path: Path to the YAML config file
        max_steps: Number of training steps to run
        timeout: Maximum time in seconds to wait
        extra_args: Additional CLI arguments to pass

    Returns:
        CompletedProcess object with stdout/stderr

    Raises:
        RuntimeError: If training times out or exits with a non-zero return code
    """
    cmd = [
        sys.executable,
        "-m",
        "apps.grpo.main",
        "--config",
        config_path,
        f"trainer.training.steps={max_steps}",
        # Disable WandB for CI to avoid auth issues - only use console logging
        # (the leading "~" is a delete-style override that removes the wandb
        # section from the resolved config)
        "~metric_logging.wandb",
    ]

    if extra_args:
        cmd.extend(extra_args)

    print(f"Running GRPO e2e test: {config_path}")
    print(f"Command: {' '.join(cmd)}")
    print(f"Max steps: {max_steps}, Timeout: {timeout}s")
    print("-" * 80)

    start_time = time.time()

    try:
        result = subprocess.run(
            cmd,
            timeout=timeout,
            capture_output=True,
            text=True,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - start_time
        raise RuntimeError(
            f"GRPO training timed out after {elapsed:.1f}s (timeout={timeout}s)"
        ) from None

    elapsed = time.time() - start_time

    # Print output for debugging
    if result.stdout:
        print("STDOUT:")
        print(result.stdout[-2000:])  # Print only the last 2000 chars to avoid overwhelming logs

    if result.stderr:
        print("\nSTDERR:")
        print(result.stderr[-2000:])

    print("-" * 80)

    # Check for success
    if result.returncode != 0:
        raise RuntimeError(
            f"GRPO training failed with return code {result.returncode} after {elapsed:.1f}s"
        )

    print(f"✓ GRPO training completed successfully in {elapsed:.1f}s")
    return result
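
# Example usage (sketch): extra CLI overrides can be passed via ``extra_args``.
# The override key below is hypothetical; real keys depend on the app's config
# schema.
#
#   run_grpo_training(
#       "apps/grpo/qwen3_1_7b.yaml",
#       max_steps=2,
#       extra_args=["trainer.training.seed=0"],
#   )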


def main():
    """Run GRPO e2e test."""
    print("=" * 80)
    print("GRPO E2E Integration Test")
    print("=" * 80)

    # Test GRPO with smallest model
    test_config = "apps/grpo/qwen3_1_7b.yaml"

    if not Path(test_config).exists():
        raise FileNotFoundError(f"Config file not found: {test_config}")

    try:
        run_grpo_training(test_config, max_steps=5, timeout=1800)
        print("\n" + "=" * 80)
        print("✓ GRPO e2e test passed!")
        print("=" * 80)
    except Exception as e:
        print("\n" + "=" * 80)
        print(f"✗ GRPO e2e test failed: {e}")
        print("=" * 80)
        sys.exit(1)


if __name__ == "__main__":
    main()
6 changes: 3 additions & 3 deletions tests/integration_tests/test_vllm_policy_correctness.py
@@ -6,7 +6,7 @@

import pytest

-from forge.actors.policy import Policy
+from forge.actors.generator import Generator
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
@@ -51,7 +51,7 @@ async def test_same_output():
    vllm_model = AsyncLLM.from_engine_args(args)

    # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
        procs=1, num_replicas=1, with_gpus=True
    ).as_service(
        engine_args={
@@ -140,7 +140,7 @@ async def test_cache_usage():
    vllm_model = AsyncLLM.from_engine_args(args)

    # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
        procs=1, num_replicas=1, with_gpus=True
    ).as_service(
        engine_args={