From 9e495b8eed306da57930c53005fc619b45fc0b98 Mon Sep 17 00:00:00 2001
From: "Jiyue (Jennifer) Wang"
Date: Tue, 14 Oct 2025 16:46:12 -0400
Subject: [PATCH 1/8] initial commit

---
 .github/workflows/gpu_test.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index c2a4705e5..f39f89916 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -61,5 +61,10 @@ jobs:
           export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
           export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
           pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
+      - name: Run integration tests
+        run: |
+          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          python -m tests.integration_tests.test_vllm_policy_correctness
       - name: Upload Coverage to Codecov
         uses: codecov/codecov-action@v3

From cbd91bb29f7415fb0628f71e680857b6c6dc17e3 Mon Sep 17 00:00:00 2001
From: Jiyue Wang
Date: Wed, 15 Oct 2025 17:16:12 -0400
Subject: [PATCH 2/8] create a new workflow for continuous tests

---
 .../continuous_integration_test.yaml        | 60 +++++++++++++++++++
 .github/workflows/gpu_test.yaml             |  5 --
 .../test_vllm_policy_correctness.py         |  4 +-
 3 files changed, 62 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/continuous_integration_test.yaml

diff --git a/.github/workflows/continuous_integration_test.yaml b/.github/workflows/continuous_integration_test.yaml
new file mode 100644
index 000000000..8f0f72cc8
--- /dev/null
+++ b/.github/workflows/continuous_integration_test.yaml
@@ -0,0 +1,60 @@
+name: Continuous Integration Tests
+
+on:
+  schedule:
+    # Runs every hour
+    - cron: '0 * * * *'
+  workflow_dispatch:
+
+concurrency:
+  group: continuous-integration-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  integration_test:
+    if: github.repository_owner == 'meta-pytorch'
+    runs-on: linux.g5.12xlarge.nvidia.gpu
+    timeout-minutes: 120
+    strategy:
+      matrix:
+        python-version: ['3.10']
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+      - name: Setup conda env
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          miniconda-version: "latest"
+          activate-environment: test
+          python-version: ${{ matrix.python-version }}
+      - name: Update pip
+        run: python -m pip install --upgrade pip
+      - name: Install pinned torch nightly
+        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
+      - name: Download and install vLLM and its dependencies
+        # TODO: this honestly could not be hackier if I tried
+        run: |
+          python -m pip install -r .github/packaging/vllm_reqs.txt
+          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
+      - name: Install Monarch
+        run: pip install torchmonarch==0.1.0rc1
+      - name: Install torchtitan and torchstore
+        run: |
+          python -m pip install git+https://github.com/pytorch/torchtitan.git
+          python -m pip install git+https://github.com/meta-pytorch/torchstore.git
+      - name: Install dependencies
+        run: python -m pip install --no-build-isolation -e ".[dev]"
+      - name: Run integration tests
+        run: |
+          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index f39f89916..c2a4705e5 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -61,10 +61,5 @@ jobs:
           export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
           export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
           pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
-      - name: Run integration tests
-        run: |
-          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
-          export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
-          python -m tests.integration_tests.test_vllm_policy_correctness
       - name: Upload Coverage to Codecov
         uses: codecov/codecov-action@v3

diff --git a/tests/integration_tests/test_vllm_policy_correctness.py b/tests/integration_tests/test_vllm_policy_correctness.py
index e2da9b068..2a47cf2b8 100644
--- a/tests/integration_tests/test_vllm_policy_correctness.py
+++ b/tests/integration_tests/test_vllm_policy_correctness.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from forge.actors.policy import Policy
+from forge.actors.generator import Generator
 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.sampling_params import RequestOutputKind
@@ -51,7 +51,7 @@ async def test_same_output():
     vllm_model = AsyncLLM.from_engine_args(args)
 
     # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
         procs=1, num_replicas=1, with_gpus=True
     ).as_service(
         engine_args={

From 714cfc62bf989d31567aa49e9eae1dbab82903ef Mon Sep 17 00:00:00 2001
From: Jiyue Wang
Date: Wed, 15 Oct 2025 17:25:40 -0400
Subject: [PATCH 3/8] ...

---
 .github/workflows/continuous_integration_test.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/continuous_integration_test.yaml b/.github/workflows/continuous_integration_test.yaml
index 8f0f72cc8..6b919c682 100644
--- a/.github/workflows/continuous_integration_test.yaml
+++ b/.github/workflows/continuous_integration_test.yaml
@@ -4,6 +4,8 @@ on:
   schedule:
     # Runs every hour
     - cron: '0 * * * *'
+  # TODO: remove this when merged to main
+  pull_request:
   workflow_dispatch:
 
 concurrency:

From c42d660fe412aeba299e5f38bc810d42305f7f45 Mon Sep 17 00:00:00 2001
From: Jiyue Wang
Date: Wed, 15 Oct 2025 17:35:09 -0400
Subject: [PATCH 4/8] fix one more place

---
 tests/integration_tests/test_vllm_policy_correctness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/test_vllm_policy_correctness.py b/tests/integration_tests/test_vllm_policy_correctness.py
index 2a47cf2b8..0208e81aa 100644
--- a/tests/integration_tests/test_vllm_policy_correctness.py
+++ b/tests/integration_tests/test_vllm_policy_correctness.py
@@ -140,7 +140,7 @@ async def test_cache_usage():
     vllm_model = AsyncLLM.from_engine_args(args)
 
     # Setup Policy service
-    policy = await Policy.options(
+    policy = await Generator.options(
         procs=1, num_replicas=1, with_gpus=True
     ).as_service(
         engine_args={

From 7d02cd044025c45782eb28cf251292ca164fe2c8 Mon Sep 17 00:00:00 2001
From: Jiyue Wang
Date: Thu, 16 Oct 2025 09:28:52 -0400
Subject: [PATCH 5/8] add grpo run

---
 .github/workflows/continuous_integration_test.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/continuous_integration_test.yaml b/.github/workflows/continuous_integration_test.yaml
index 6b919c682..6f1a053fb 100644
--- a/.github/workflows/continuous_integration_test.yaml
+++ b/.github/workflows/continuous_integration_test.yaml
@@ -60,3 +60,4 @@ jobs:
           export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
           export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
           pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv
+          python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml

From 664de1cb50a9709f2f968d7e9de0517f41663768 Mon Sep 17 00:00:00 2001
From: Jiyue Wang
Date: Thu, 16 Oct 2025 09:48:14 -0400
Subject: [PATCH 6/8] add run_e2e_tests.py

---
 .../continuous_integration_test.yaml     |  10 +-
 tests/integration_tests/run_e2e_tests.py | 129 ++++++++++++++++++
 2 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration_tests/run_e2e_tests.py

diff --git a/.github/workflows/continuous_integration_test.yaml b/.github/workflows/continuous_integration_test.yaml
index 6f1a053fb..34469710e 100644
--- a/.github/workflows/continuous_integration_test.yaml
+++ b/.github/workflows/continuous_integration_test.yaml
@@ -55,9 +55,15 @@ jobs:
           python -m pip install git+https://github.com/meta-pytorch/torchstore.git
       - name: Install dependencies
         run: python -m pip install --no-build-isolation -e ".[dev]"
-      - name: Run integration tests
+      - name: Run vLLM policy correctness tests
         run: |
           export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
           export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
           pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv
-          python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml
+        timeout-minutes: 20
+      - name: Run e2e GRPO training test
+        run: |
+          export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
+          python tests/integration_tests/run_e2e_tests.py
+        timeout-minutes: 30

diff --git a/tests/integration_tests/run_e2e_tests.py b/tests/integration_tests/run_e2e_tests.py
new file mode 100644
index 000000000..baf8dcc1b
--- /dev/null
+++ b/tests/integration_tests/run_e2e_tests.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+End-to-end integration test runner for Forge applications.
+
+This test runner validates that training can run without crashes or exceptions.
+Similar to TorchTitan's integration test approach, we focus on functional
+correctness (no crashes) rather than numerical validation.
+
+Usage:
+    python tests/integration_tests/run_e2e_tests.py
+"""
+
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+def run_grpo_test(
+    config_path: str,
+    max_steps: int = 5,
+    timeout: int = 1800,
+    extra_args: list[str] | None = None,
+) -> subprocess.CompletedProcess:
+    """
+    Run GRPO training and verify it completes without crashes.
+
+    Args:
+        config_path: Path to YAML config file
+        max_steps: Number of training steps to run
+        timeout: Maximum time in seconds to wait
+        extra_args: Additional CLI arguments to pass
+
+    Returns:
+        CompletedProcess object with stdout/stderr
+
+    Raises:
+        Exception: If training fails with non-zero exit code
+    """
+    cmd = [
+        sys.executable,
+        "-m",
+        "apps.grpo.main",
+        "--config",
+        config_path,
+        "--trainer.training.steps",
+        str(max_steps),
+        # Disable WandB for CI to avoid auth issues - only use console logging
+        "--metric_logging",
+        '{"console": {"reduce_across_ranks": true}}',
+    ]
+
+    if extra_args:
+        cmd.extend(extra_args)
+
+    print(f"Running e2e test: {config_path}")
+    print(f"Command: {' '.join(cmd)}")
+    print(f"Max steps: {max_steps}, Timeout: {timeout}s")
+    print("-" * 80)
+
+    start_time = time.time()
+
+    try:
+        result = subprocess.run(
+            cmd,
+            timeout=timeout,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.TimeoutExpired:
+        elapsed = time.time() - start_time
+        raise Exception(
+            f"Training timed out after {elapsed:.1f}s (timeout={timeout}s)"
+        )
+
+    elapsed = time.time() - start_time
+
+    # Print output for debugging
+    if result.stdout:
+        print("STDOUT:")
+        print(result.stdout[-2000:])  # Print last 2000 chars to avoid overwhelming logs
+
+    if result.stderr:
+        print("\nSTDERR:")
+        print(result.stderr[-2000:])
+
+    print("-" * 80)
+
+    # Check for success
+    if result.returncode != 0:
+        raise Exception(
+            f"Training failed with return code {result.returncode} after {elapsed:.1f}s"
+        )
+
+    print(f"✓ Training completed successfully in {elapsed:.1f}s")
+    return result
+
+
+def main():
+    """Run all e2e tests."""
+    print("=" * 80)
+    print("Forge E2E Integration Tests")
+    print("=" * 80)
+
+    # Test 1: GRPO with smallest model
+    test_config = "apps/grpo/qwen3_1_7b.yaml"
+
+    if not Path(test_config).exists():
+        raise FileNotFoundError(f"Config file not found: {test_config}")
+
+    try:
+        run_grpo_test(test_config, max_steps=5, timeout=1800)
+        print("\n" + "=" * 80)
+        print("✓ All e2e tests passed!")
+        print("=" * 80)
+    except Exception as e:
+        print("\n" + "=" * 80)
+        print(f"✗ E2E test failed: {e}")
+        print("=" * 80)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
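A side note on the runner added above: it exposes only a main() entrypoint and defines no test_* functions, so pytest collects nothing from it, and the CI step invokes it directly with python instead. A minimal, hypothetical pytest wrapper is sketched below; it is not part of this patch series, and it assumes tests/integration_tests is importable as a package so run_grpo_test from the file above can be reused.

    # Hypothetical wrapper, not part of this patch series.
    from tests.integration_tests.run_e2e_tests import run_grpo_test


    def test_grpo_e2e_smoke():
        # run_grpo_test() raises if the training subprocess exits non-zero or
        # times out, which pytest reports as a test failure.
        run_grpo_test("apps/grpo/qwen3_1_7b.yaml", max_steps=5, timeout=1800)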
From 86a1ca7f9088dcecf61d90c75da0d607a0ba0536 Mon Sep 17 00:00:00 2001
From: Jiyue Wang
Date: Thu, 16 Oct 2025 09:55:58 -0400
Subject: [PATCH 7/8] more changes

---
 .../continuous_integration_test.yaml        |  6 ++--
 .../{run_e2e_tests.py => test_grpo_e2e.py}  | 28 +++++++++----------
 2 files changed, 17 insertions(+), 17 deletions(-)
 rename tests/integration_tests/{run_e2e_tests.py => test_grpo_e2e.py} (77%)

diff --git a/.github/workflows/continuous_integration_test.yaml b/.github/workflows/continuous_integration_test.yaml
index 34469710e..e27d197f9 100644
--- a/.github/workflows/continuous_integration_test.yaml
+++ b/.github/workflows/continuous_integration_test.yaml
@@ -61,9 +61,9 @@ jobs:
           export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
           pytest tests/integration_tests/test_vllm_policy_correctness.py --durations=20 -vv
         timeout-minutes: 20
-      - name: Run e2e GRPO training test
+      - name: Run GRPO e2e test
         run: |
           export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
           export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
-          python tests/integration_tests/run_e2e_tests.py
-        timeout-minutes: 30
+          python tests/integration_tests/test_grpo_e2e.py
+        timeout-minutes: 20

diff --git a/tests/integration_tests/run_e2e_tests.py b/tests/integration_tests/test_grpo_e2e.py
similarity index 77%
rename from tests/integration_tests/run_e2e_tests.py
rename to tests/integration_tests/test_grpo_e2e.py
index baf8dcc1b..97d7f1827 100644
--- a/tests/integration_tests/run_e2e_tests.py
+++ b/tests/integration_tests/test_grpo_e2e.py
@@ -5,14 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-End-to-end integration test runner for Forge applications.
+End-to-end integration test for GRPO training.
 
-This test runner validates that training can run without crashes or exceptions.
+This test validates that GRPO training can run without crashes or exceptions.
 Similar to TorchTitan's integration test approach, we focus on functional
 correctness (no crashes) rather than numerical validation.
 
 Usage:
-    python tests/integration_tests/run_e2e_tests.py
+    python tests/integration_tests/test_grpo_e2e.py
 """
 
 import subprocess
@@ -21,7 +21,7 @@ import time
 from pathlib import Path
 
 
-def run_grpo_test(
+def run_grpo_training(
    config_path: str,
     max_steps: int = 5,
     timeout: int = 1800,
@@ -58,7 +58,7 @@ def run_grpo_training(
     if extra_args:
         cmd.extend(extra_args)
 
-    print(f"Running e2e test: {config_path}")
+    print(f"Running GRPO e2e test: {config_path}")
     print(f"Command: {' '.join(cmd)}")
     print(f"Max steps: {max_steps}, Timeout: {timeout}s")
     print("-" * 80)
@@ -75,7 +75,7 @@ def run_grpo_training(
     except subprocess.TimeoutExpired:
         elapsed = time.time() - start_time
         raise Exception(
-            f"Training timed out after {elapsed:.1f}s (timeout={timeout}s)"
+            f"GRPO training timed out after {elapsed:.1f}s (timeout={timeout}s)"
         )
 
     elapsed = time.time() - start_time
@@ -94,33 +94,33 @@ def run_grpo_training(
     # Check for success
     if result.returncode != 0:
         raise Exception(
-            f"Training failed with return code {result.returncode} after {elapsed:.1f}s"
+            f"GRPO training failed with return code {result.returncode} after {elapsed:.1f}s"
         )
 
-    print(f"✓ Training completed successfully in {elapsed:.1f}s")
+    print(f"✓ GRPO training completed successfully in {elapsed:.1f}s")
     return result
 
 
 def main():
-    """Run all e2e tests."""
+    """Run GRPO e2e test."""
     print("=" * 80)
-    print("Forge E2E Integration Tests")
+    print("GRPO E2E Integration Test")
     print("=" * 80)
 
-    # Test 1: GRPO with smallest model
+    # Test GRPO with smallest model
     test_config = "apps/grpo/qwen3_1_7b.yaml"
 
     if not Path(test_config).exists():
         raise FileNotFoundError(f"Config file not found: {test_config}")
 
     try:
-        run_grpo_test(test_config, max_steps=5, timeout=1800)
+        run_grpo_training(test_config, max_steps=5, timeout=1800)
         print("\n" + "=" * 80)
-        print("✓ All e2e tests passed!")
+        print("✓ GRPO e2e test passed!")
         print("=" * 80)
     except Exception as e:
         print("\n" + "=" * 80)
-        print(f"✗ E2E test failed: {e}")
+        print(f"✗ GRPO e2e test failed: {e}")
         print("=" * 80)
         sys.exit(1)

From 65543f4e4e44f3cb8fdff4dbbf3bdd4881f96ab0 Mon Sep 17 00:00:00 2001
From: "Jiyue (Jennifer) Wang"
Date: Thu, 16 Oct 2025 11:20:19 -0400
Subject: [PATCH 8/8] try fix the wandb issue

---
 tests/integration_tests/test_grpo_e2e.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/integration_tests/test_grpo_e2e.py b/tests/integration_tests/test_grpo_e2e.py
index 97d7f1827..f59b57eb5 100644
--- a/tests/integration_tests/test_grpo_e2e.py
+++ b/tests/integration_tests/test_grpo_e2e.py
@@ -48,11 +48,9 @@ def run_grpo_training(
         "apps.grpo.main",
         "--config",
         config_path,
-        "--trainer.training.steps",
-        str(max_steps),
+        f"trainer.training.steps={str(max_steps)}",
         # Disable WandB for CI to avoid auth issues - only use console logging
-        "--metric_logging",
-        '{"console": {"reduce_across_ranks": true}}',
+        "~metric_logging.wandb",
     ]
 
     if extra_args:
         cmd.extend(extra_args)
@@ -76,7 +74,7 @@ def run_grpo_training(
         elapsed = time.time() - start_time
         raise Exception(
             f"GRPO training timed out after {elapsed:.1f}s (timeout={timeout}s)"
-        )
+        ) from None
 
     elapsed = time.time() - start_time
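A closing note on [PATCH 8/8]: it replaces the flag-style arguments (--trainer.training.steps plus a JSON value for --metric_logging) with a dotted key=value assignment and a ~metric_logging.wandb deletion, which is the override style the GRPO app's config CLI appears to accept; the deletion drops the WandB backend so CI only logs to the console. The sketch below is a toy illustration of the assumed semantics of those two override forms, not Forge's actual config code, and all names in it are hypothetical.

    # Toy illustration of the assumed override semantics, not Forge's real parser:
    # "a.b.c=v" assigns a nested key, "~a.b" deletes a subtree (here the WandB
    # metric-logging backend).
    from typing import Any


    def apply_overrides(config: dict[str, Any], overrides: list[str]) -> dict[str, Any]:
        for item in overrides:
            if item.startswith("~"):
                *parents, leaf = item[1:].split(".")
                node = config
                for key in parents:
                    node = node[key]
                node.pop(leaf, None)  # remove e.g. metric_logging.wandb entirely
            else:
                path, value = item.split("=", 1)
                *parents, leaf = path.split(".")
                node = config
                for key in parents:
                    node = node.setdefault(key, {})
                node[leaf] = value  # values stay strings in this sketch
        return config


    if __name__ == "__main__":
        cfg = {
            "trainer": {"training": {"steps": 1000}},
            "metric_logging": {"console": {}, "wandb": {"project": "grpo"}},
        }
        apply_overrides(cfg, ["trainer.training.steps=5", "~metric_logging.wandb"])
        print(cfg)  # steps overridden, wandb backend removed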