diff --git a/.github/packaging/pre_build_cpu.sh b/.github/packaging/pre_build_cpu.sh index 9c5d7eb97..60913449d 100644 --- a/.github/packaging/pre_build_cpu.sh +++ b/.github/packaging/pre_build_cpu.sh @@ -24,7 +24,7 @@ echo "wheel dir is $WHL_DIR" build_vllm() { cd "$BUILD_DIR" - git clone https://github.com/vllm-project/vllm.git --branch $VLLM_BRANCH + git clone https://github.com/vllm-project/vllm.git --branch $VLLM_VERSION cd "$BUILD_DIR/vllm" python use_existing_torch.py diff --git a/.github/packaging/vllm_reqs_12_8.txt b/.github/packaging/vllm_reqs_12_8.txt index c7d38ec64..d1ba5e385 100644 --- a/.github/packaging/vllm_reqs_12_8.txt +++ b/.github/packaging/vllm_reqs_12_8.txt @@ -1,11 +1,4 @@ -# These requirements were generated by running steps 1-3 of scripts/build_wheels.sh -# then running pip freeze and manually removing the vllm dependency. -# The intention of this file is to use these known requirements for a fixed -# vLLM build to supplement a vLLM install from download.pytorch.org without -# resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find -# vLLM dependencies (as this results in a ResolutionTooDeep error from pip). -# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach. -# TODO: this should be done way less hackily +# This file was generated by running ./scripts/generate_vllm_reqs.sh aiohappyeyeballs==2.6.1 aiohttp==3.13.1 aiosignal==1.4.0 @@ -33,8 +26,8 @@ dnspython==2.8.0 einops==0.8.1 email-validator==2.3.0 exceptiongroup==1.3.0 -fastapi==0.119.0 -fastapi-cli==0.0.13 +fastapi==0.119.1 +fastapi-cli==0.0.14 fastapi-cloud-cli==0.3.1 fastrlock==0.8.3 filelock==3.19.1 @@ -94,7 +87,7 @@ prometheus-fastapi-instrumentator==7.1.0 prometheus_client==0.23.1 propcache==0.4.1 protobuf==6.33.0 -psutil==7.1.0 +psutil==7.1.1 py-cpuinfo==9.0.0 pybase64==1.4.2 pycountry==24.6.1 @@ -108,9 +101,9 @@ python-json-logger==4.0.0 python-multipart==0.0.20 PyYAML==6.0.3 pyzmq==27.1.0 -ray==2.50.0 +ray==2.50.1 referencing==0.37.0 -regex==2025.9.18 +regex==2025.10.23 requests==2.32.5 rich==14.2.0 rich-toolkit==0.15.1 @@ -119,8 +112,8 @@ rpds-py==0.27.1 safetensors==0.6.2 scipy==1.15.3 sentencepiece==0.2.1 -sentry-sdk==2.42.0 -setuptools-scm==9.2.1 +sentry-sdk==2.42.1 +setuptools-scm==9.2.2 shellingham==1.5.4 sniffio==1.3.1 soundfile==0.13.1 @@ -134,11 +127,11 @@ torch==2.9.0+cu128 tqdm==4.67.1 transformers==4.57.1 triton==3.5.0 -typer==0.19.2 +typer==0.20.0 typing-inspection==0.4.2 typing_extensions==4.15.0 urllib3==2.5.0 -uvicorn==0.37.0 +uvicorn==0.38.0 uvloop==0.22.1 watchfiles==1.1.1 websockets==15.0.1 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 6d3c35669..88b8446e3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -26,25 +26,11 @@ jobs: activate-environment: test python-version: '3.10' auto-activate: false - - name: Verify conda environment - shell: bash -l {0} - run: | - conda info - which python - which conda - name: Update pip shell: bash -l {0} run: python -m pip install --upgrade pip - - name: Install pytorch - shell: bash -l {0} - run: pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu130 --force-reinstall - - name: Install monarch - shell: bash -l {0} - run: pip install assets/ci/monarch_no_torch-0.1.0.dev20251010-py3-none-any.whl - name: Install torchforge shell: bash -l {0} - env: - GH_TOKEN: ${{ github.token }} run: ./scripts/install.sh - name: Install docs dependencies shell: bash -l {0} @@ -52,36 +38,7 @@ jobs: - name: Build docs shell: bash -l {0} working-directory: docs - run: | - # Set up library paths to ensure all dependencies are available - # This is critical for monarch and other native dependencies that need libpython3.10.so.1.0 - export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}" - - # Also set CUDA paths if needed - if [ -d "/usr/local/cuda-12.9" ]; then - export LD_LIBRARY_PATH="/usr/local/cuda-12.9/compat:${LD_LIBRARY_PATH}" - export CUDA_HOME=/usr/local/cuda-12.9 - fi - - # Verify dependencies can be imported before building docs - echo "Verifying dependencies..." - python -c "import forge; print('✓ torchforge imported successfully')" - python -c "import monarch; print('✓ monarch imported successfully')" - - # Build docs with -W (warnings as errors) and --keep-going to see all issues - # Capture exit code but continue to see all errors - set +e - make html SPHINXOPTS="-W --keep-going" - BUILD_EXIT_CODE=$? - set -e - - # Report results - if [ $BUILD_EXIT_CODE -ne 0 ]; then - echo "❌ Documentation build failed with warnings or errors (exit code: $BUILD_EXIT_CODE)" - exit $BUILD_EXIT_CODE - else - echo "✅ Documentation build completed successfully with no warnings or errors" - fi + run: make html - name: Upload docs artifact uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 301ac79f5..e9cafeebc 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -1,9 +1,6 @@ -name: GPU Tests +name: Unit Tests (GPU) on: - schedule: - # Runs at midnight every day - - cron: '0 0 * * *' push: branches: [ main ] pull_request: @@ -27,7 +24,7 @@ jobs: runs-on: linux.g5.12xlarge.nvidia.gpu strategy: matrix: - python-version: ['3.10'] + python-version: ['3.10', '3.11', '3.12'] steps: - name: Check out repo uses: actions/checkout@v4 @@ -40,26 +37,10 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - - name: Install pinned torch nightly - run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 - - name: Download and install vLLM and its dependencies - # TODO: this honestly could not be hackier if I tried - run: | - python -m pip install -r .github/packaging/vllm_reqs_12_9.txt - python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge - - name: Install Monarch - run: pip install torchmonarch==0.1.0rc1 - - name: Install torchtitan and torchstore - run: | - python -m pip install git+https://github.com/pytorch/torchtitan.git - python -m pip install git+https://github.com/meta-pytorch/torchstore.git - - name: Install dependencies - run: python -m pip install --no-build-isolation -e ".[dev]" + - name: Install torchforge + run: ./scripts/install.sh - name: Run unit tests with coverage # TODO add all tests - run: | - export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 - export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 - pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv + run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml deleted file mode 100644 index 73063620f..000000000 --- a/.github/workflows/unit_test.yaml +++ /dev/null @@ -1,44 +0,0 @@ -name: Unit Tests - -on: - pull_request: - push: - branches: [ main ] - workflow_dispatch: - -jobs: - unit_tests: - runs-on: ubuntu-latest - timeout-minutes: 15 - strategy: - matrix: - python-version: ['3.10', '3.11', '3.12'] - steps: - - name: Check out repo - uses: actions/checkout@v4 - - name: Setup conda env - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniconda-version: "latest" - activate-environment: test - python-version: ${{ matrix.python-version }} - - name: Update pip - run: python -m pip install --upgrade pip - - name: Install pytorch - run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu - - name: Install monarch - run: pip install assets/ci/monarch_no_torch-0.1.0.dev20251010-py3-none-any.whl - - name: Install torchstore - run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl - - name: Install torchtitan - run: | - pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl - pip install tyro - - name: Install dependencies - run: python -m pip install --no-build-isolation -e ".[dev]" - - name: Run unit tests with coverage - # TODO add all tests - run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v3 diff --git a/.meta/mast/env_setup.sh b/.meta/mast/env_setup.sh index 323e2febe..fca86adf5 100755 --- a/.meta/mast/env_setup.sh +++ b/.meta/mast/env_setup.sh @@ -150,10 +150,10 @@ if [ -f "$VERSIONS_FILE" ]; then log_info "Sourcing version information from: $VERSIONS_FILE" source "$VERSIONS_FILE" - if [ -n "$TORCHTITAN_COMMIT" ]; then - log_info "Installing torchtitan from commit: $TORCHTITAN_COMMIT" + if [ -n "$TORCHTITAN_COMMIT_MAST" ]; then + log_info "Installing torchtitan from commit: $TORCHTITAN_COMMIT_MAST" pip uninstall -y torchtitan - pip install "git+https://github.com/pytorch/torchtitan.git@$TORCHTITAN_COMMIT" + pip install "git+https://github.com/pytorch/torchtitan.git@$TORCHTITAN_COMMIT_MAST" if [ $? -eq 0 ]; then log_info "Torchtitan installed successfully" @@ -162,7 +162,7 @@ if [ -f "$VERSIONS_FILE" ]; then exit 1 fi else - log_error "TORCHTITAN_COMMIT not found in versions.sh" + log_error "TORCHTITAN_COMMIT_MAST not found in versions.sh" exit 1 fi else diff --git a/README.md b/README.md index 5511bc611..18dd34b50 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # image torchforge #### A PyTorch-native agentic RL library that lets you focus on algorithms—not infra. -[![Unit Tests](https://github.com/meta-pytorch/forge/actions/workflows/unit_test.yaml/badge.svg?branch=main)](https://github.com/meta-pytorch/forge/actions/workflows/unit_test.yaml?query=branch%3Amain) [![GPU Tests](https://github.com/meta-pytorch/forge/actions/workflows/gpu_test.yaml/badge.svg?branch=main)](https://github.com/meta-pytorch/forge/actions/workflows/gpu_test.yaml?query=branch%3Amain) ## Overview @@ -31,14 +30,11 @@ You can also find our notebook tutorials (coming soon) ### Basic -torchforge requires the latest PyTorch nightly with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan). For convenience, -we have pre-packaged these dependencies as wheels in assets/wheels. (Note that the basic install script +torchforge requires PyTorch 2.9.0 with [Monarch](https://github.com/meta-pytorch/monarch), [vLLM](https://github.com/vllm-project/vllm), and [torchtitan](https://github.com/pytorch/torchtitan). (Note that the basic install script uses [DNF](https://docs.fedoraproject.org/en-US/quick-docs/dnf/), but could be easily extended to other Linux OS.) -torchforge requires the Github CLI (gh) to download a compatible vLLM package. See [here](https://github.com/cli/cli#installation) for gh install instructions before continuting. Please login to gh with your Github account before continuing with `gh auth login`. You may use either https or ssh as the protocol for authentication. - ```bash -conda create -n forge python=3.10 +conda create -n forge python=3.12 conda activate forge ./scripts/install.sh ``` @@ -51,11 +47,6 @@ After install, you can run the following command and should see output confirmin python -m apps.grpo.main --config apps/grpo/qwen3_1_7b.yaml ``` -If you need to re-build the wheels for whatever reason, you can do so with: -```bash -./scripts/build_wheels.sh -``` - ## Quick Start To run SFT on a Llama3 8B model, run diff --git a/apps/grpo/main.py b/apps/grpo/main.py index ef522e57b..db3b9655f 100644 --- a/apps/grpo/main.py +++ b/apps/grpo/main.py @@ -59,8 +59,7 @@ def policy_version(self) -> int | None: @property def request_tensor(self) -> torch.Tensor: - request_tokens: torch.Tensor = self.completion.prompt_ids - tensor = torch.tensor(request_tokens, dtype=torch.long) + tensor: torch.Tensor = self.completion.prompt_ids.to(torch.long) if tensor.shape[0] < self.request_len: # left pad diff = self.request_len - tensor.shape[0] tensor = F.pad(tensor, (diff, 0), value=self.pad_id) @@ -68,8 +67,7 @@ def request_tensor(self) -> torch.Tensor: @property def response_tensor(self) -> torch.Tensor: - response_tokens: torch.Tensor = self.completion.token_ids - tensor = torch.tensor(response_tokens, dtype=torch.long) + tensor: torch.Tensor = self.completion.token_ids.to(torch.long) if tensor.shape[0] < self.response_len: # right pad diff = self.response_len - tensor.shape[0] tensor = F.pad(tensor, (0, diff), value=self.pad_id) diff --git a/assets/versions.sh b/assets/versions.sh index 49a755dc0..ff7aff5cd 100644 --- a/assets/versions.sh +++ b/assets/versions.sh @@ -7,13 +7,12 @@ # Version Configuration for Forge Wheel Building # This file contains all pinned versions and commits for dependencies -# PyTorch version -PYTORCH_VERSION="2.9.0.dev20250905" +# Stable versions of upstream libraries for OSS repo +PYTORCH_VERSION="2.9.0" +VLLM_VERSION="v0.10.0" +MONARCH_VERSION="0.1.0rc8" +TORCHTITAN_VERSION="0.2.0" +TORCHSTORE_VERSION="0.0.1.rc3" -# vLLM branch -VLLM_BRANCH="v0.10.0" - -# Commit hashes -MONARCH_COMMIT="195503223b5c2896846171f60ac99dc6868f8f2c" -TORCHTITAN_COMMIT="d0e25450bcac2332359b13fbda430dc701f073d4" -TORCHSTORE_COMMIT="662299faf4fd50ee30bd9aa3f4ce8c0e2db1d310" +# Torchtitan commit hash for launching on MAST +TORCHTITAN_COMMIT_MAST="d0e25450bcac2332359b13fbda430dc701f073d4" diff --git a/assets/wheels/monarch-0.0.1-cp310-cp310-linux_x86_64.whl b/assets/wheels/monarch-0.0.1-cp310-cp310-linux_x86_64.whl deleted file mode 100644 index a704f8703..000000000 Binary files a/assets/wheels/monarch-0.0.1-cp310-cp310-linux_x86_64.whl and /dev/null differ diff --git a/assets/wheels/torchstore-0.1.0-py3-none-any.whl b/assets/wheels/torchstore-0.1.0-py3-none-any.whl deleted file mode 100644 index 2a958c709..000000000 Binary files a/assets/wheels/torchstore-0.1.0-py3-none-any.whl and /dev/null differ diff --git a/assets/wheels/torchtitan-0.1.0-py3-none-any.whl b/assets/wheels/torchtitan-0.1.0-py3-none-any.whl deleted file mode 100644 index cc61d6db1..000000000 Binary files a/assets/wheels/torchtitan-0.1.0-py3-none-any.whl and /dev/null differ diff --git a/scripts/build_wheels.sh b/scripts/build_wheels.sh deleted file mode 100755 index 407d17e6f..000000000 --- a/scripts/build_wheels.sh +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -#!/bin/bash -set -euo pipefail - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -# Source version configuration -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -VERSIONS_FILE="$SCRIPT_DIR/../assets/versions.sh" - -if [ ! -f "$VERSIONS_FILE" ]; then - echo -e "${RED}[ERROR]${NC} Versions file not found: $VERSIONS_FILE" - exit 1 -fi - -source "$VERSIONS_FILE" - -# Validate required variables are set -validate_versions() { - local missing_vars=() - - [ -z "${PYTORCH_VERSION:-}" ] && missing_vars+=("PYTORCH_VERSION") - [ -z "${VLLM_BRANCH:-}" ] && missing_vars+=("VLLM_BRANCH") - [ -z "${MONARCH_COMMIT:-}" ] && missing_vars+=("MONARCH_COMMIT") - [ -z "${TORCHTITAN_COMMIT:-}" ] && missing_vars+=("TORCHTITAN_COMMIT") - [ -z "${TORCHSTORE_COMMIT:-}" ] && missing_vars+=("TORCHSTORE_COMMIT") - - if [ ${#missing_vars[@]} -gt 0 ]; then - echo -e "${RED}[ERROR]${NC} Missing required variables in $VERSIONS_FILE:" - for var in "${missing_vars[@]}"; do - echo " - $var" - done - exit 1 - fi -} - -validate_versions - -# Configuration -BUILD_DIR="$HOME/forge-build" -WHEEL_DIR="$(pwd)/assets/wheels" - -# Logging functions -log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } -log_step() { echo -e "${BLUE}[STEP]${NC} $1"; } - -# Function to handle step failures -handle_failure() { - local step_name="$1" - local exit_code="$2" - - log_error "Step failed: $step_name" - log_error "Exit code: $exit_code" - log_error "Working directory: $(pwd)" - echo "" - exit $exit_code -} - -# Validation functions -check_conda_env() { - if [ -z "${CONDA_DEFAULT_ENV:-}" ]; then - log_error "Not running in a conda environment" - log_info "Please create and activate your conda environment first:" - log_info " conda create -n forge python=3.10 -y" - log_info " conda activate forge" - exit 1 - fi - log_info "Running in conda environment: $CONDA_DEFAULT_ENV" -} - -check_command() { - if ! command -v "$1" &> /dev/null; then - log_error "Required command '$1' not found" - exit 1 - fi -} - -check_sudo() { - if ! sudo -n true 2>/dev/null; then - log_error "This script requires passwordless sudo access" - log_info "Run 'sudo -v' first, or configure passwordless sudo" - exit 1 - fi -} - -check_disk_space() { - local required_gb=10 - local available_gb=$(df ~/ --output=avail -BG | tail -1 | sed 's/G//') - if [ "$available_gb" -lt "$required_gb" ]; then - log_error "Insufficient disk space. Need ${required_gb}GB, have ${available_gb}GB" - exit 1 - fi -} - -# Main validation -validate_environment() { - log_info "Validating environment..." - - check_conda_env - check_command git - check_command curl - check_command python - check_command pip - check_command conda - check_sudo - check_disk_space - - # Check if CUDA toolkit will be available - if ! ldconfig -p | grep -q cuda; then - log_warn "CUDA libraries not found in ldconfig. Will attempt to install CUDA toolkit." - fi - - log_info "Environment validation passed" -} - -# Setup build directory and wheels directory -setup_build_dir() { - log_info "Setting up build directory: $BUILD_DIR" - mkdir -p "$BUILD_DIR" - log_info "Setting up wheels directory: $WHEEL_DIR" - mkdir -p "$WHEEL_DIR" - log_info "Build and wheels directories created" -} - -# Setup CUDA environment variables -setup_cuda_env() { - log_info "Setting up CUDA environment..." - - export CUDA_VERSION=12.9 - export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc - export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc - export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} - export PATH="${CUDA_HOME}/bin:$PATH" - export CUDA_INCLUDE_DIRS=$CUDA_HOME/include - export CUDA_CUDART_LIBRARY=$CUDA_HOME/lib64/libcudart.so - export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:${LD_LIBRARY_PATH:-} - export LIBRARY_PATH=$CUDA_HOME/lib64:${LIBRARY_PATH:-} - - # Save to file for persistence - cat > ~/.forge_cuda_env << 'EOF' -export CUDA_VERSION=12.9 -export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc -export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc -export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} -export PATH="${CUDA_HOME}/bin:$PATH" -export CUDA_INCLUDE_DIRS=$CUDA_HOME/include -export CUDA_CUDART_LIBRARY=$CUDA_HOME/lib64/libcudart.so -export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:${LD_LIBRARY_PATH:-} -export LIBRARY_PATH=${CUDA_HOME}/lib64:${LIBRARY_PATH:-} -EOF - - log_info "CUDA environment configured" -} - -# Parse command line arguments -BUILD_TARGETS=() - -while [[ $# -gt 0 ]]; do - case $1 in - vllm|monarch|torchtitan|torchstore) - BUILD_TARGETS+=("$1") - shift - ;; - --help|-h) - echo "Usage: $0 [TARGETS...]" - echo "" - echo "Build wheels for Forge dependencies." - echo "" - echo "Targets (default: all):" - echo " vllm Build vLLM wheel" - echo " monarch Build Monarch wheel" - echo " torchtitan Build torchtitan wheel" - echo " torchstore Build torchstore wheel" - echo "" - echo "Examples:" - echo " $0 # Build all wheels" - echo " $0 vllm # Build only vLLM" - echo " $0 monarch torchtitan # Build Monarch and torchtitan" - exit 0 - ;; - *) - log_error "Unknown argument: $1" - log_info "Use --help to see available options" - exit 1 - ;; - esac -done - -# If no targets specified, build all -if [ ${#BUILD_TARGETS[@]} -eq 0 ]; then - BUILD_TARGETS=("vllm" "monarch" "torchtitan" "torchstore") - log_info "No targets specified, building all wheels" -else - log_info "Building wheels: ${BUILD_TARGETS[*]}" -fi - -# Helper function to check if a target should be built -should_build() { - local target="$1" - for t in "${BUILD_TARGETS[@]}"; do - if [ "$t" == "$target" ]; then - return 0 - fi - done - return 1 -} - -# Step execution wrapper -run_step() { - local step_name="$1" - local step_function="$2" - - log_step "$step_name" - - if ! $step_function; then - handle_failure "$step_name" "$?" - fi -} - -# Step 1: Install PyTorch nightly -step1_pytorch() { - pip3 install --pre torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129 -} - -# Step 2: Install CUDA system packages -step2_cuda_packages() { - sudo dnf install -y cuda-toolkit-12-9 cuda-compat-12-9 - setup_cuda_env -} - -# Step 3: Build vLLM wheel -step3_vllm() { - log_info "Building vLLM from branch: $VLLM_BRANCH (from $VERSIONS_FILE)" - cd "$BUILD_DIR" - if [ -d "vllm" ]; then - log_warn "vLLM directory exists, removing..." - rm -rf vllm - fi - - git clone https://github.com/vllm-project/vllm.git --branch $VLLM_BRANCH - cd "$BUILD_DIR/vllm" - - python use_existing_torch.py - pip install -r requirements/build.txt - pip wheel --no-build-isolation --no-deps . -w "$WHEEL_DIR" -} - -# Step 4: Setup Rust toolchain -step4_rust_setup() { - # Install Rust if not present - if ! command -v rustup &> /dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - source ~/.cargo/env - fi - - rustup toolchain install nightly - rustup default nightly - - # Install additional system packages - conda install -y libunwind - sudo dnf install -y clang-devel libnccl-devel - sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel -} - -# Step 5: Build Monarch wheel -step5_monarch() { - log_info "Building Monarch from commit: $MONARCH_COMMIT (from $VERSIONS_FILE)" - cd "$BUILD_DIR" - if [ -d "monarch" ]; then - log_warn "Monarch directory exists, removing..." - rm -rf monarch - fi - - git clone https://github.com/meta-pytorch/monarch.git - cd "$BUILD_DIR/monarch" - git checkout $MONARCH_COMMIT - - pip install -r build-requirements.txt - pip wheel --no-build-isolation --no-deps . -w "$WHEEL_DIR" -} - -# Step 6: Build torchtitan wheel -step6_torchtitan() { - log_info "Building torchtitan from commit: $TORCHTITAN_COMMIT (from $VERSIONS_FILE)" - cd "$BUILD_DIR" - if [ -d "torchtitan" ]; then - log_warn "torchtitan directory exists, removing..." - rm -rf torchtitan - fi - - git clone https://github.com/pytorch/torchtitan.git - cd "$BUILD_DIR/torchtitan" - git checkout $TORCHTITAN_COMMIT - - pip wheel --no-deps . -w "$WHEEL_DIR" -} - -# Step 7: Build torchstore wheel -step7_torchstore() { - log_info "Building torchstore from commit: $TORCHSTORE_COMMIT (from $VERSIONS_FILE)" - cd "$BUILD_DIR" - if [ -d "torchstore" ]; then - log_warn "torchstore directory exists, removing..." - rm -rf torchstore - fi - - git clone https://github.com/meta-pytorch/torchstore.git - cd "$BUILD_DIR/torchstore" - git checkout $TORCHSTORE_COMMIT - - pip wheel --no-deps . -w "$WHEEL_DIR" -} - -# Verification -verify_installation() { - log_info "Verifying wheel builds..." - - python -c "import torch; print(f'PyTorch {torch.__version__} (CUDA: {torch.cuda.is_available()})')" - - # Check that wheels were created - wheel_count=$(ls -1 "$WHEEL_DIR"/*.whl 2>/dev/null | wc -l) - if [ "$wheel_count" -gt 0 ]; then - log_info "Built $wheel_count wheels:" - ls -1 "$WHEEL_DIR"/*.whl | sed 's/.*\// /' - else - log_error "No wheels found in $WHEEL_DIR" - return 1 - fi - - log_info "Wheel building verification complete!" -} - -# Main execution -main() { - echo "Forge Wheel Builder" - echo "===================" - echo "" - - validate_environment - setup_build_dir - - # PyTorch is needed for all builds - run_step "Installing PyTorch nightly" step1_pytorch - - # CUDA packages are needed for vLLM and Monarch - if should_build "vllm" || should_build "monarch"; then - run_step "Installing CUDA packages and setting environment" step2_cuda_packages - fi - - # Build requested wheels - if should_build "vllm"; then - run_step "Building vLLM wheel" step3_vllm - fi - - # Rust setup is needed for Monarch - if should_build "monarch"; then - run_step "Setting up Rust toolchain and additional packages" step4_rust_setup - run_step "Building Monarch wheel" step5_monarch - fi - - if should_build "torchtitan"; then - run_step "Building torchtitan wheel" step6_torchtitan - fi - - if should_build "torchstore"; then - run_step "Building torchstore wheel" step7_torchstore - fi - - verify_installation - - echo "" - log_info "Wheel building completed successfully!" - log_info "" - log_info "Built wheels are in: $WHEEL_DIR" - log_info "" - log_info "Users can now install with:" - log_info " conda create -n forge python=3.10 -y" - log_info " conda activate forge" - log_info " pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129" - log_info " pip install $WHEEL_DIR/*.whl" - if should_build "vllm" || should_build "monarch"; then - log_info " source ~/.forge_cuda_env" - fi - log_info "" - log_info "Build artifacts are in: $BUILD_DIR" - log_info "You can remove them with: rm -rf $BUILD_DIR" -} - - -# Run main function -main "$@" diff --git a/scripts/generate_vllm_reqs.sh b/scripts/generate_vllm_reqs.sh new file mode 100755 index 000000000..6da96c200 --- /dev/null +++ b/scripts/generate_vllm_reqs.sh @@ -0,0 +1,183 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +#!/bin/bash +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +# Source version configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VERSIONS_FILE="$SCRIPT_DIR/../assets/versions.sh" + +if [ ! -f "$VERSIONS_FILE" ]; then + echo -e "${RED}[ERROR]${NC} Versions file not found: $VERSIONS_FILE" + exit 1 +fi + +source "$VERSIONS_FILE" + +# Configuration +BUILD_DIR="$HOME/forge-build" +WHEEL_DIR="$(pwd)/assets/wheels" + +# Logging functions +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + + +# Validation functions +check_conda_env() { + if [ -z "${CONDA_DEFAULT_ENV:-}" ]; then + log_error "Not running in a conda environment" + log_info "Please create and activate your conda environment first:" + log_info " conda create -n forge python=3.10 -y" + log_info " conda activate forge" + exit 1 + fi + log_info "Running in conda environment: $CONDA_DEFAULT_ENV" +} + +check_command() { + if ! command -v "$1" &> /dev/null; then + log_error "Required command '$1' not found" + exit 1 + fi +} + +check_sudo() { + if ! sudo -n true 2>/dev/null; then + log_error "This script requires passwordless sudo access" + log_info "Run 'sudo -v' first, or configure passwordless sudo" + exit 1 + fi +} + +check_disk_space() { + local required_gb=10 + local available_gb=$(df ~/ --output=avail -BG | tail -1 | sed 's/G//') + if [ "$available_gb" -lt "$required_gb" ]; then + log_error "Insufficient disk space. Need ${required_gb}GB, have ${available_gb}GB" + exit 1 + fi +} + +# Main validation +validate_environment() { + log_info "Validating environment..." + + check_conda_env + check_command git + check_command curl + check_command python + check_command pip + check_command conda + check_sudo + check_disk_space + + # Check if CUDA toolkit will be available + if ! ldconfig -p | grep -q cuda; then + log_warn "CUDA libraries not found in ldconfig. Will attempt to install CUDA toolkit." + fi + + log_info "Environment validation passed" +} + +# Setup build directory and wheels directory +setup_build_dir() { + log_info "Setting up build directory: $BUILD_DIR" + mkdir -p "$BUILD_DIR" + log_info "Setting up wheels directory: $WHEEL_DIR" + mkdir -p "$WHEEL_DIR" + log_info "Build and wheels directories created" +} + +# Setup CUDA environment variables +setup_cuda_env() { + log_info "Setting up CUDA environment..." + + export CUDA_VERSION=12.8 + export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc + export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc + export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} + export PATH="${CUDA_HOME}/bin:$PATH" + export CUDA_INCLUDE_DIRS=$CUDA_HOME/include + export CUDA_CUDART_LIBRARY=$CUDA_HOME/lib64/libcudart.so + export LD_LIBRARY_PATH=/usr/local/cuda-12.8/compat:${LD_LIBRARY_PATH:-} + export LIBRARY_PATH=$CUDA_HOME/lib64:${LIBRARY_PATH:-} + + # Save to file for persistence + cat > ~/.forge_cuda_env << 'EOF' +export CUDA_VERSION=12.8 +export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc +export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc +export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} +export PATH="${CUDA_HOME}/bin:$PATH" +export CUDA_INCLUDE_DIRS=$CUDA_HOME/include +export CUDA_CUDART_LIBRARY=$CUDA_HOME/lib64/libcudart.so +export LD_LIBRARY_PATH=/usr/local/cuda-12.8/compat:${LD_LIBRARY_PATH:-} +export LIBRARY_PATH=${CUDA_HOME}/lib64:${LIBRARY_PATH:-} +EOF + + log_info "CUDA environment configured" +} + +# Step 1: Install PyTorch stable +step1_pytorch() { + pip3 install --pre torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/cu128 +} + +# Step 2: Install CUDA system packages +step2_cuda_packages() { + sudo dnf install -y cuda-toolkit-12-8 cuda-compat-12-8 + setup_cuda_env +} + +# Step 3: Build vLLM wheel +step3_vllm() { + log_info "Building vLLM from branch: $VLLM_VERSION (from $VERSIONS_FILE)" + cd "$BUILD_DIR" + if [ -d "vllm" ]; then + log_warn "vLLM directory exists, removing..." + rm -rf vllm + fi + + git clone https://github.com/vllm-project/vllm.git --branch $VLLM_VERSION + cd "$BUILD_DIR/vllm" + + python use_existing_torch.py + pip install -r requirements/build.txt + pip install --no-build-isolation -e . +} + +# Main execution +main() { + echo "Forge Wheel Builder" + echo "===================" + echo "" + + validate_environment + setup_build_dir + + # Install PyTorch, CUDA packages, and vLLM + step1_pytorch + step2_cuda_packages + step3_vllm + + # Output requirements to .github/packaging/vllm_reqs_12_8.txt + REQS_FILE="$SCRIPT_DIR/../.github/packaging/vllm_reqs_12_8.txt" + pip freeze | grep -v "vllm*" > $REQS_FILE + sed -i '1i# This file was generated by running ./scripts/generate_vllm_reqs.sh' $REQS_FILE +} + + +# Run main function +main "$@" diff --git a/scripts/install.sh b/scripts/install.sh index 30dcf1ff5..022d40648 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -138,85 +138,6 @@ install_system_packages() { fi } -# Check to see if gh is installed, if not, it will be installed via conda-forge channel -check_gh_install() { - if ! command -v gh &> /dev/null; then - log_warning "GitHub CLI (gh) not found. Installing via Conda..." - conda install gh --channel conda-forge -y - log_info "GitHub CLI (gh) installed successfully." - log_info "Please run 'gh auth login' to authenticate with GitHub." - else - log_info "GitHub CLI (gh) already installed." - fi -} - -# Check wheels exist -check_wheels() { - if [ ! -d "$WHEEL_DIR" ]; then - log_error "Wheels directory not found: $WHEEL_DIR" - exit 1 - fi - - local wheel_count=$(ls -1 "$WHEEL_DIR"/*.whl 2>/dev/null | wc -l) - log_info "Found $wheel_count local wheels" -} - -# Download vLLM wheel from GitHub releases -download_vllm_wheel() { - log_info "Downloading vLLM wheel from GitHub releases..." - - # Check if gh is installed - if ! command -v gh &> /dev/null; then - log_error "GitHub CLI (gh) is required to download vLLM wheel" - log_info "Install it with: sudo dnf install gh" - log_info "Then run: gh auth login" - exit 1 - fi - - # Get the vLLM wheel filename from the release - local vllm_wheel_name - vllm_wheel_name=$(gh release view "$RELEASE_TAG" --repo "$GITHUB_REPO" --json assets --jq '.assets[] | select(.name | contains("vllm")) | .name' | head -1) - - if [ -z "$vllm_wheel_name" ]; then - log_error "Could not find vLLM wheel in release $RELEASE_TAG" - log_info "Make sure the vLLM wheel has been uploaded to the GitHub release" - exit 1 - fi - for f in assets/wheels/vllm-*; do - [ -e "$f" ] || continue # skip if glob didn't match - if [ "$(basename "$f")" != "$vllm_wheel_name" ]; then - log_info "Removing stale vLLM wheel: $(basename "$f")" - rm -f "$f" - fi - done - - local local_path="$WHEEL_DIR/$vllm_wheel_name" - - if [ -f "$local_path" ]; then - log_info "vLLM wheel already downloaded: $vllm_wheel_name" - return 0 - fi - - log_info "Downloading: $vllm_wheel_name" - - # Save current directory and change to wheel directory - local original_dir=$(pwd) - cd "$WHEEL_DIR" - gh release download "$RELEASE_TAG" --repo "$GITHUB_REPO" --pattern "*vllm*" - local download_result=$? - - # Always return to original directory - cd "$original_dir" - - if [ $download_result -eq 0 ]; then - log_info "Successfully downloaded vLLM wheel" - else - log_error "Failed to download vLLM wheel" - exit 1 - fi -} - - # Parse command line arguments parse_args() { USE_SUDO=false @@ -255,7 +176,6 @@ main() { echo "======================" echo "" echo "Note: Run this from the root of the forge repository" - echo "This script requires GitHub CLI (gh) to download large wheels" if [ "$USE_SUDO" = "true" ]; then echo "System packages will be installed via system package manager (requires sudo)" check_sudo @@ -264,24 +184,29 @@ main() { fi echo "" - check_conda_env - check_wheels - # Install openssl as we overwrite the default version when we update LD_LIBRARY_PATH conda install -y openssl install_system_packages "$USE_SUDO" - check_gh_install - download_vllm_wheel - log_info "Installing PyTorch nightly..." - pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129 + log_info "Installing PyTorch ..." + pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/cu128 + + # Install vLLM and its requirements + pip install -r .github/packaging/vllm_reqs_12_8.txt + pip install six + pip install "setuptools<80" + python -m pip install vllm --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge + + # Install monarch + pip install torchmonarch==$MONARCH_VERSION - log_info "Installing all wheels (local + downloaded)..." - pip install "$WHEEL_DIR"/*.whl + # Install torchtitan and torchstore + pip install torchtitan==$TORCHTITAN_VERSION + pip install torchstore==$TORCHSTORE_VERSION log_info "Installing Forge from source..." - pip install -e . + pip install -e ".[dev]" # Set up environment log_info "Setting up environment..." @@ -301,7 +226,7 @@ main() { local cuda_activation_script="${conda_env_dir}/etc/conda/activate.d/cuda_env.sh" cat > "$cuda_activation_script" << 'EOF' # CUDA environment for Forge -export CUDA_VERSION=12.9 +export CUDA_VERSION=12.8 export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}