Skip to content
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3abb3e8
enable vLLM upload with CUDA 12.8 build
ebsmothers Oct 20, 2025
48a11a5
install stable torch
ebsmothers Oct 20, 2025
395c1ab
copy vllm_reqs.txt from joe's pr
ebsmothers Oct 20, 2025
f0878f7
remove nightly index path
ebsmothers Oct 20, 2025
994b400
Merge branch 'main' into cuda-128
ebsmothers Oct 21, 2025
85421b5
use install script in GHA workflows
ebsmothers Oct 21, 2025
fa02756
move to install script
ebsmothers Oct 21, 2025
b370d97
update vLLM requirements files
ebsmothers Oct 21, 2025
4d2a103
[testing] run unit test on 3.10 only
ebsmothers Oct 21, 2025
ffdaa52
Revert "[testing] run unit test on 3.10 only"
ebsmothers Oct 21, 2025
f9a6620
wip changes
ebsmothers Oct 21, 2025
80462a1
Merge branch 'main' into cuda-128
ebsmothers Oct 21, 2025
5f260eb
comments
ebsmothers Oct 21, 2025
be0069a
remove unused wheels
ebsmothers Oct 21, 2025
6af94cd
more cleanup
ebsmothers Oct 21, 2025
9eb1bc1
update versions
ebsmothers Oct 21, 2025
4044dc1
simplify things
ebsmothers Oct 22, 2025
7a26f8a
revert docs changes
ebsmothers Oct 22, 2025
6c65fad
update requirements and script
ebsmothers Oct 22, 2025
e55e235
one more try
ebsmothers Oct 22, 2025
4ab4caf
try python 3.11
ebsmothers Oct 22, 2025
67a18c0
run only on 310
joecummings Oct 22, 2025
100f584
Comment out hopefully unnecessary parts of build
joecummings Oct 22, 2025
d10a8c9
You need a GPU runner to run stuff
joecummings Oct 22, 2025
95c2f25
Install six
joecummings Oct 22, 2025
c9b2a62
Simplify docs build
joecummings Oct 22, 2025
27f77df
setuptools < 80
joecummings Oct 22, 2025
0995062
ugh, quote it
joecummings Oct 22, 2025
49be5a1
Use py 312 in main ex
joecummings Oct 22, 2025
008fad8
Remove build refernce
joecummings Oct 22, 2025
dc79ad8
delete cpu unit test workflow
ebsmothers Oct 22, 2025
bb15ede
typo fix
ebsmothers Oct 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/packaging/pre_build_cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ echo "wheel dir is $WHL_DIR"
build_vllm() {
cd "$BUILD_DIR"

git clone https://github.com/vllm-project/vllm.git --branch $VLLM_BRANCH
git clone https://github.com/vllm-project/vllm.git --branch $VLLM_VERSION
cd "$BUILD_DIR/vllm"

python use_existing_torch.py
Expand Down
27 changes: 10 additions & 17 deletions .github/packaging/vllm_reqs_12_8.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
# These requirements were generated by running steps 1-3 of scripts/build_wheels.sh
# then running pip freeze and manually removing the vllm dependency.
# The intention of this file is to use these known requirements for a fixed
# vLLM build to supplement a vLLM install from download.pytorch.org without
# resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find
# vLLM dependencies (as this results in a ResolutionTooDeep error from pip).
# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach.
# TODO: this should be done way less hackily
# This file was generated by running ./scripts/generate_vllm_reqs.sh
aiohappyeyeballs==2.6.1
aiohttp==3.13.1
aiosignal==1.4.0
Expand Down Expand Up @@ -33,8 +26,8 @@ dnspython==2.8.0
einops==0.8.1
email-validator==2.3.0
exceptiongroup==1.3.0
fastapi==0.119.0
fastapi-cli==0.0.13
fastapi==0.119.1
fastapi-cli==0.0.14
fastapi-cloud-cli==0.3.1
fastrlock==0.8.3
filelock==3.19.1
Expand Down Expand Up @@ -94,7 +87,7 @@ prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.23.1
propcache==0.4.1
protobuf==6.33.0
psutil==7.1.0
psutil==7.1.1
py-cpuinfo==9.0.0
pybase64==1.4.2
pycountry==24.6.1
Expand All @@ -108,9 +101,9 @@ python-json-logger==4.0.0
python-multipart==0.0.20
PyYAML==6.0.3
pyzmq==27.1.0
ray==2.50.0
ray==2.50.1
referencing==0.37.0
regex==2025.9.18
regex==2025.10.23
requests==2.32.5
rich==14.2.0
rich-toolkit==0.15.1
Expand All @@ -119,8 +112,8 @@ rpds-py==0.27.1
safetensors==0.6.2
scipy==1.15.3
sentencepiece==0.2.1
sentry-sdk==2.42.0
setuptools-scm==9.2.1
sentry-sdk==2.42.1
setuptools-scm==9.2.2
shellingham==1.5.4
sniffio==1.3.1
soundfile==0.13.1
Expand All @@ -134,11 +127,11 @@ torch==2.9.0+cu128
tqdm==4.67.1
transformers==4.57.1
triton==3.5.0
typer==0.19.2
typer==0.20.0
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.5.0
uvicorn==0.37.0
uvicorn==0.38.0
uvloop==0.22.1
watchfiles==1.1.1
websockets==15.0.1
Expand Down
45 changes: 1 addition & 44 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,62 +26,19 @@ jobs:
activate-environment: test
python-version: '3.10'
auto-activate: false
- name: Verify conda environment
shell: bash -l {0}
run: |
conda info
which python
which conda
- name: Update pip
shell: bash -l {0}
run: python -m pip install --upgrade pip
- name: Install pytorch
shell: bash -l {0}
run: pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu130 --force-reinstall
- name: Install monarch
shell: bash -l {0}
run: pip install assets/ci/monarch_no_torch-0.1.0.dev20251010-py3-none-any.whl
- name: Install torchforge
shell: bash -l {0}
env:
GH_TOKEN: ${{ github.token }}
run: ./scripts/install.sh
- name: Install docs dependencies
shell: bash -l {0}
run: python -m pip install -r docs/requirements.txt
- name: Build docs
shell: bash -l {0}
working-directory: docs
run: |
# Set up library paths to ensure all dependencies are available
# This is critical for monarch and other native dependencies that need libpython3.10.so.1.0
export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}"

# Also set CUDA paths if needed
if [ -d "/usr/local/cuda-12.9" ]; then
export LD_LIBRARY_PATH="/usr/local/cuda-12.9/compat:${LD_LIBRARY_PATH}"
export CUDA_HOME=/usr/local/cuda-12.9
fi

# Verify dependencies can be imported before building docs
echo "Verifying dependencies..."
python -c "import forge; print('✓ torchforge imported successfully')"
python -c "import monarch; print('✓ monarch imported successfully')"

# Build docs with -W (warnings as errors) and --keep-going to see all issues
# Capture exit code but continue to see all errors
set +e
make html SPHINXOPTS="-W --keep-going"
BUILD_EXIT_CODE=$?
set -e

# Report results
if [ $BUILD_EXIT_CODE -ne 0 ]; then
echo "❌ Documentation build failed with warnings or errors (exit code: $BUILD_EXIT_CODE)"
exit $BUILD_EXIT_CODE
else
echo "✅ Documentation build completed successfully with no warnings or errors"
fi
run: make html
- name: Upload docs artifact
uses: actions/upload-artifact@v4
with:
Expand Down
29 changes: 5 additions & 24 deletions .github/workflows/gpu_test.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
name: GPU Tests
name: Unit Tests (GPU)

on:
schedule:
# Runs at midnight every day
- cron: '0 0 * * *'
push:
branches: [ main ]
pull_request:
Expand All @@ -27,7 +24,7 @@ jobs:
runs-on: linux.g5.12xlarge.nvidia.gpu
strategy:
matrix:
python-version: ['3.10']
python-version: ['3.10', '3.11', '3.12']
steps:
- name: Check out repo
uses: actions/checkout@v4
Expand All @@ -40,26 +37,10 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install pinned torch nightly
run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
- name: Download and install vLLM and its dependencies
# TODO: this honestly could not be hackier if I tried
run: |
python -m pip install -r .github/packaging/vllm_reqs_12_9.txt
python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
- name: Install Monarch
run: pip install torchmonarch==0.1.0rc1
- name: Install torchtitan and torchstore
run: |
python -m pip install git+https://github.com/pytorch/torchtitan.git
python -m pip install git+https://github.com/meta-pytorch/torchstore.git
- name: Install dependencies
run: python -m pip install --no-build-isolation -e ".[dev]"
- name: Install torchforge
run: ./scripts/install.sh
- name: Run unit tests with coverage
# TODO add all tests
run: |
export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0
export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0
pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
84 changes: 42 additions & 42 deletions .github/workflows/unit_test.yaml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These won't work b/c u need a GPU runner

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait why? they were working before, no?

Original file line number Diff line number Diff line change
@@ -1,44 +1,44 @@
name: Unit Tests
# name: Unit Tests

on:
pull_request:
push:
branches: [ main ]
workflow_dispatch:
# on:
# pull_request:
# push:
# branches: [ main ]
# workflow_dispatch:

jobs:
unit_tests:
runs-on: ubuntu-latest
timeout-minutes: 15
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12']
steps:
- name: Check out repo
uses: actions/checkout@v4
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install pytorch
run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install monarch
run: pip install assets/ci/monarch_no_torch-0.1.0.dev20251010-py3-none-any.whl
- name: Install torchstore
run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl
- name: Install torchtitan
run: |
pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl
pip install tyro
- name: Install dependencies
run: python -m pip install --no-build-isolation -e ".[dev]"
- name: Run unit tests with coverage
# TODO add all tests
run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
# jobs:
# unit_tests:
# runs-on: ubuntu-latest
# timeout-minutes: 15
# strategy:
# matrix:
# python-version: ['3.10']
# steps:
# - name: Check out repo
# uses: actions/checkout@v4
# with:
# fetch-depth: 0
# - name: Setup conda env
# uses: conda-incubator/setup-miniconda@v2
# with:
# auto-update-conda: true
# miniconda-version: "latest"
# activate-environment: test
# python-version: '3.10'
# auto-activate: false
# - name: Verify conda environment
# shell: bash -l {0}
# run: |
# conda info
# which python
# which conda
# - name: Update pip
# shell: bash -l {0}
# run: python -m pip install --upgrade pip
# - name: Install torchforge
# run: ./scripts/install.sh
# - name: Run unit tests with coverage
# # TODO add all tests
# run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
# - name: Upload Coverage to Codecov
# uses: codecov/codecov-action@v3
8 changes: 4 additions & 4 deletions .meta/mast/env_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,10 @@ if [ -f "$VERSIONS_FILE" ]; then
log_info "Sourcing version information from: $VERSIONS_FILE"
source "$VERSIONS_FILE"

if [ -n "$TORCHTITAN_COMMIT" ]; then
log_info "Installing torchtitan from commit: $TORCHTITAN_COMMIT"
if [ -n "$TORCHTITAN_COMMIT_MAST" ]; then
log_info "Installing torchtitan from commit: $TORCHTITAN_COMMIT_MAST"
pip uninstall -y torchtitan
pip install "git+https://github.com/pytorch/torchtitan.git@$TORCHTITAN_COMMIT"
pip install "git+https://github.com/pytorch/torchtitan.git@$TORCHTITAN_COMMIT_MAST"

if [ $? -eq 0 ]; then
log_info "Torchtitan installed successfully"
Expand All @@ -162,7 +162,7 @@ if [ -f "$VERSIONS_FILE" ]; then
exit 1
fi
else
log_error "TORCHTITAN_COMMIT not found in versions.sh"
log_error "TORCHTITAN_COMMIT_MAST not found in versions.sh"
exit 1
fi
else
Expand Down
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# <img width="35" height="35" alt="image" src="https://github.com/user-attachments/assets/2700a971-e5d6-4036-b03f-2f89c9791609" /> torchforge

#### A PyTorch-native agentic RL library that lets you focus on algorithms—not infra.
[![Unit Tests](https://github.com/meta-pytorch/forge/actions/workflows/unit_test.yaml/badge.svg?branch=main)](https://github.com/meta-pytorch/forge/actions/workflows/unit_test.yaml?query=branch%3Amain)
[![GPU Tests](https://github.com/meta-pytorch/forge/actions/workflows/gpu_test.yaml/badge.svg?branch=main)](https://github.com/meta-pytorch/forge/actions/workflows/gpu_test.yaml?query=branch%3Amain)

## Overview
Expand Down Expand Up @@ -31,14 +30,11 @@ You can also find our notebook tutorials (coming soon)

### Basic

torchforge requires the latest PyTorch nightly with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan). For convenience,
we have pre-packaged these dependencies as wheels in assets/wheels. (Note that the basic install script
torchforge requires PyTorch 2.9.0 with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan). (Note that the basic install script
uses [DNF](https://docs.fedoraproject.org/en-US/quick-docs/dnf/), but could be easily extended to other Linux OS.)

torchforge requires the Github CLI (gh) to download a compatible vLLM package. See [here](https://github.com/cli/cli#installation) for gh install instructions before continuing. Please login to gh with your Github account before continuing with `gh auth login`. You may use either https or ssh as the protocol for authentication.

```bash
conda create -n forge python=3.10
conda create -n forge python=3.12
conda activate forge
./scripts/install.sh
```
Expand Down
6 changes: 2 additions & 4 deletions apps/grpo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,15 @@ def policy_version(self) -> int | None:

@property
def request_tensor(self) -> torch.Tensor:
request_tokens: torch.Tensor = self.completion.prompt_ids
tensor = torch.tensor(request_tokens, dtype=torch.long)
tensor: torch.Tensor = self.completion.prompt_ids.to(torch.long)
if tensor.shape[0] < self.request_len: # left pad
diff = self.request_len - tensor.shape[0]
tensor = F.pad(tensor, (diff, 0), value=self.pad_id)
return tensor

@property
def response_tensor(self) -> torch.Tensor:
response_tokens: torch.Tensor = self.completion.token_ids
tensor = torch.tensor(response_tokens, dtype=torch.long)
tensor: torch.Tensor = self.completion.token_ids.to(torch.long)
if tensor.shape[0] < self.response_len: # right pad
diff = self.response_len - tensor.shape[0]
tensor = F.pad(tensor, (0, diff), value=self.pad_id)
Expand Down
17 changes: 8 additions & 9 deletions assets/versions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
# Version Configuration for Forge Wheel Building
# This file contains all pinned versions and commits for dependencies

# PyTorch version
PYTORCH_VERSION="2.9.0.dev20250905"
# Stable versions of upstream libraries for OSS repo
PYTORCH_VERSION="2.9.0"
VLLM_VERSION="v0.10.0"
MONARCH_VERSION="0.1.0rc8"
TORCHTITAN_VERSION="0.2.0"
TORCHSTORE_VERSION="0.0.1.rc3"

# vLLM branch
VLLM_BRANCH="v0.10.0"

# Commit hashes
MONARCH_COMMIT="195503223b5c2896846171f60ac99dc6868f8f2c"
TORCHTITAN_COMMIT="d0e25450bcac2332359b13fbda430dc701f073d4"
TORCHSTORE_COMMIT="662299faf4fd50ee30bd9aa3f4ce8c0e2db1d310"
# Torchtitan commit hash for launching on MAST
TORCHTITAN_COMMIT_MAST="d0e25450bcac2332359b13fbda430dc701f073d4"
Binary file not shown.
Binary file removed assets/wheels/torchstore-0.1.0-py3-none-any.whl
Binary file not shown.
Binary file removed assets/wheels/torchtitan-0.1.0-py3-none-any.whl
Binary file not shown.
Loading
Loading