Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3abb3e8
enable vLLM upload with CUDA 12.8 build
ebsmothers Oct 20, 2025
48a11a5
install stable torch
ebsmothers Oct 20, 2025
395c1ab
copy vllm_reqs.txt from joe's pr
ebsmothers Oct 20, 2025
f0878f7
remove nightly index path
ebsmothers Oct 20, 2025
994b400
Merge branch 'main' into cuda-128
ebsmothers Oct 21, 2025
85421b5
use install script in GHA workflows
ebsmothers Oct 21, 2025
fa02756
move to install script
ebsmothers Oct 21, 2025
b370d97
update vLLM requirements files
ebsmothers Oct 21, 2025
4d2a103
[testing] run unit test on 3.10 only
ebsmothers Oct 21, 2025
ffdaa52
Revert "[testing] run unit test on 3.10 only"
ebsmothers Oct 21, 2025
f9a6620
wip changes
ebsmothers Oct 21, 2025
80462a1
Merge branch 'main' into cuda-128
ebsmothers Oct 21, 2025
5f260eb
comments
ebsmothers Oct 21, 2025
be0069a
remove unused wheels
ebsmothers Oct 21, 2025
6af94cd
more cleanup
ebsmothers Oct 21, 2025
9eb1bc1
update versions
ebsmothers Oct 21, 2025
4044dc1
simplify things
ebsmothers Oct 22, 2025
7a26f8a
revert docs changes
ebsmothers Oct 22, 2025
6c65fad
update requirements and script
ebsmothers Oct 22, 2025
e55e235
one more try
ebsmothers Oct 22, 2025
4ab4caf
try python 3.11
ebsmothers Oct 22, 2025
67a18c0
run only on 310
joecummings Oct 22, 2025
100f584
Comment out hopefully unnecessary parts of build
joecummings Oct 22, 2025
d10a8c9
You need a GPU runner to run stuff
joecummings Oct 22, 2025
95c2f25
Install six
joecummings Oct 22, 2025
c9b2a62
Simplify docs build
joecummings Oct 22, 2025
27f77df
setuptools < 80
joecummings Oct 22, 2025
0995062
ugh, quote it
joecummings Oct 22, 2025
49be5a1
Use py 312 in main ex
joecummings Oct 22, 2025
008fad8
Remove build reference
joecummings Oct 22, 2025
dc79ad8
delete cpu unit test workflow
ebsmothers Oct 22, 2025
bb15ede
typo fix
ebsmothers Oct 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,7 @@ jobs:
- name: Update pip
shell: bash -l {0}
run: python -m pip install --upgrade pip
- name: Install pytorch
shell: bash -l {0}
run: pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu130 --force-reinstall
- name: Install monarch
shell: bash -l {0}
run: pip install assets/ci/monarch_no_torch-0.1.0.dev20251010-py3-none-any.whl
- name: Install torchforge
shell: bash -l {0}
env:
GH_TOKEN: ${{ github.token }}
run: ./scripts/install.sh
- name: Install docs dependencies
shell: bash -l {0}
Expand All @@ -58,9 +49,9 @@ jobs:
export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}"

# Also set CUDA paths if needed
if [ -d "/usr/local/cuda-12.9" ]; then
export LD_LIBRARY_PATH="/usr/local/cuda-12.9/compat:${LD_LIBRARY_PATH}"
export CUDA_HOME=/usr/local/cuda-12.9
if [ -d "/usr/local/cuda-12.8" ]; then
export LD_LIBRARY_PATH="/usr/local/cuda-12.8/compat:${LD_LIBRARY_PATH}"
export CUDA_HOME=/usr/local/cuda-12.8
fi

# Verify dependencies can be imported before building docs
Expand Down
17 changes: 2 additions & 15 deletions .github/workflows/gpu_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,8 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install pinned torch nightly
run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
- name: Download and install vLLM and its dependencies
# TODO: this honestly could not be hackier if I tried
run: |
python -m pip install -r .github/packaging/vllm_reqs_12_9.txt
python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
- name: Install Monarch
run: pip install torchmonarch==0.1.0rc1
- name: Install torchtitan and torchstore
run: |
python -m pip install git+https://github.com/pytorch/torchtitan.git
python -m pip install git+https://github.com/meta-pytorch/torchstore.git
- name: Install dependencies
run: python -m pip install --no-build-isolation -e ".[dev]"
- name: Install torchforge
run: ./scripts/install.sh
- name: Run unit tests with coverage
# TODO add all tests
run: |
Expand Down
14 changes: 2 additions & 12 deletions .github/workflows/unit_test.yaml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These won't work b/c u need a GPU runner

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait why? they were working before, no?

Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,8 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install pytorch
run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install monarch
run: pip install assets/ci/monarch_no_torch-0.1.0.dev20251010-py3-none-any.whl
- name: Install torchstore
run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl
- name: Install torchtitan
run: |
pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl
pip install tyro
- name: Install dependencies
run: python -m pip install --no-build-isolation -e ".[dev]"
- name: Install torchforge
run: ./scripts/install.sh
- name: Run unit tests with coverage
# TODO add all tests
run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
Expand Down
2 changes: 1 addition & 1 deletion assets/versions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# This file contains all pinned versions and commits for dependencies

# PyTorch version
PYTORCH_VERSION="2.9.0.dev20250905"
PYTORCH_VERSION="2.9.0"

# vLLM branch
VLLM_BRANCH="v0.10.0"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we change these to point to monarch stable, torchtitan stable and torchstore stable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, right now I have them hardcoded. But good point. Honestly, I need to see where else these versions are used... there's a world where we just delete this file entirely to reduce indirection.

Expand Down
154 changes: 13 additions & 141 deletions scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -95,128 +95,6 @@ detect_os_family() {
esac
}

# Install required system packages (RDMA / InfiniBand verbs libraries).
#
# Arguments:
#   $1 - "true" to install through the OS package manager using sudo;
#        any other value (default: "false") installs through conda instead.
# Calls:
#   detect_os_family, log_info, log_error (defined elsewhere in this script).
# Exits:
#   1 when sudo installation is requested but non-interactive sudo is not
#   available, or when the detected OS family is "unknown".
install_system_packages() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need this anymore?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Idk maybe we do, just trying it out

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

brought it back just for safety

# First positional argument selects the install method; defaults to conda.
local use_sudo=${1:-false}

log_info "Installing required system packages..."

if [ "$use_sudo" = "true" ]; then
# User explicitly requested sudo installation
# `sudo -n` never prompts, so this only succeeds when sudo works
# without asking for a password.
if sudo -n true 2>/dev/null; then
# Detect OS family using /etc/os-release
# Declared separately from the assignment so `local` does not mask the
# exit status of detect_os_family.
local os_family
os_family=$(detect_os_family)

# Package names differ between the dnf and apt-get branches below.
case "$os_family" in
"rhel_fedora")
log_info "Detected RHEL/Fedora-based OS - using system package manager"
sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
;;
"debian")
log_info "Detected Debian-based OS - using system package manager"
sudo apt-get update
sudo apt-get install -y libibverbs1 rdma-core libmlx5-1 libibverbs-dev rdma-core-dev
;;
"unknown")
# No supported package manager identified: abort the whole script.
log_error "Unsupported OS for automatic system package installation"
log_info "Supported distributions: RHEL/Fedora-based (rhel fedora) and Debian-based (debian)"
exit 1
;;
esac
log_info "System packages installed successfully via system package manager"
else
# sudo was requested but cannot be used non-interactively: abort.
log_error "Sudo installation requested but no sudo access available"
log_info "Either run with sudo privileges or remove the --use-sudo flag to use conda"
exit 1
fi
else
# Default to conda installation
log_info "Installing system packages via conda (default method)"
# NOTE(review): presumably installs into the currently active conda
# environment - confirm against check_conda_env in the caller.
conda install -c conda-forge rdma-core libibverbs-cos7-x86_64 -y
log_info "Conda package installation completed. Packages installed in conda environment."
fi
}

# Ensure the GitHub CLI (gh) is available, installing it from the
# conda-forge channel when it is not already on PATH.
#
# Calls:   log_info, log_warning, log_error (defined elsewhere in this script).
# Outputs: progress messages through the log_* helpers.
# Exits:   1 if gh is missing and the conda installation fails.
check_gh_install() {
  if command -v gh &> /dev/null; then
    log_info "GitHub CLI (gh) already installed."
    return 0
  fi

  log_warning "GitHub CLI (gh) not found. Installing via Conda..."
  # Check the install explicitly: reporting success after a failed install
  # would only defer the error to the first real `gh` invocation.
  if ! conda install gh --channel conda-forge -y; then
    log_error "Failed to install GitHub CLI (gh) via Conda"
    exit 1
  fi

  log_info "GitHub CLI (gh) installed successfully."
  log_info "Please run 'gh auth login' to authenticate with GitHub."
}

# Verify that the local wheels directory exists and report how many wheel
# files it contains.
#
# Globals: WHEEL_DIR (read) - directory expected to hold *.whl files.
# Calls:   log_info, log_error (defined elsewhere in this script).
# Exits:   1 if WHEEL_DIR is not a directory.
check_wheels() {
  if [[ ! -d "$WHEEL_DIR" ]]; then
    log_error "Wheels directory not found: $WHEEL_DIR"
    exit 1
  fi

  # Count glob matches directly instead of parsing `ls | wc -l`, which
  # miscounts names containing newlines and lists the contents of any
  # directory whose name happens to end in .whl.
  local wheel_count=0
  local wheel
  for wheel in "$WHEEL_DIR"/*.whl; do
    # An unmatched glob stays literal; skip it so an empty dir counts as 0.
    [[ -e "$wheel" || -L "$wheel" ]] || continue
    wheel_count=$((wheel_count + 1))
  done

  log_info "Found $wheel_count local wheels"
}

# Download the vLLM wheel attached to a GitHub release into WHEEL_DIR.
#
# The first release asset whose name contains "vllm" is treated as the
# wanted wheel. Older vllm-* files in WHEEL_DIR are removed, and the
# download is skipped when the wanted wheel is already present.
#
# Globals: RELEASE_TAG (read) - release to query.
#          GITHUB_REPO (read) - repository passed to `gh --repo`.
#          WHEEL_DIR   (read) - destination directory for the wheel.
# Calls:   gh, log_info, log_error (log_* defined elsewhere in this script).
# Returns: 0 when the wheel is already present or was downloaded.
# Exits:   1 if gh is unavailable, no vLLM asset is found, or the
#          download fails.
download_vllm_wheel() {
  log_info "Downloading vLLM wheel from GitHub releases..."

  # Check if gh is installed
  if ! command -v gh &> /dev/null; then
    log_error "GitHub CLI (gh) is required to download vLLM wheel"
    log_info "Install it with: sudo dnf install gh"
    log_info "Then run: gh auth login"
    exit 1
  fi

  # Get the vLLM wheel filename from the release. A failing `gh` yields an
  # empty name, which is reported by the check right below.
  local vllm_wheel_name
  vllm_wheel_name=$(gh release view "$RELEASE_TAG" --repo "$GITHUB_REPO" --json assets --jq '.assets[] | select(.name | contains("vllm")) | .name' | head -1)

  if [[ -z "$vllm_wheel_name" ]]; then
    log_error "Could not find vLLM wheel in release $RELEASE_TAG"
    log_info "Make sure the vLLM wheel has been uploaded to the GitHub release"
    exit 1
  fi

  # Remove stale vLLM wheels from the same directory the download targets.
  # The loop variable is local so it does not leak into the caller's scope.
  local f
  for f in "$WHEEL_DIR"/vllm-*; do
    [[ -e "$f" ]] || continue # skip if glob didn't match
    if [[ "${f##*/}" != "$vllm_wheel_name" ]]; then
      log_info "Removing stale vLLM wheel: ${f##*/}"
      rm -f -- "$f"
    fi
  done

  local local_path="$WHEEL_DIR/$vllm_wheel_name"

  if [[ -f "$local_path" ]]; then
    log_info "vLLM wheel already downloaded: $vllm_wheel_name"
    return 0
  fi

  log_info "Downloading: $vllm_wheel_name"

  # Let gh write straight into WHEEL_DIR rather than cd-ing there and back:
  # the working directory is never changed, so a failed `cd` can no longer
  # send the download to the wrong place. Testing the command in the `if`
  # also keeps the error message reachable if the script runs under `set -e`.
  if gh release download "$RELEASE_TAG" --repo "$GITHUB_REPO" --pattern "*vllm*" --dir "$WHEEL_DIR"; then
    log_info "Successfully downloaded vLLM wheel"
  else
    log_error "Failed to download vLLM wheel"
    exit 1
  fi
}


# Parse command line arguments
parse_args() {
USE_SUDO=false
Expand Down Expand Up @@ -255,33 +133,27 @@ main() {
echo "======================"
echo ""
echo "Note: Run this from the root of the forge repository"
echo "This script requires GitHub CLI (gh) to download large wheels"
if [ "$USE_SUDO" = "true" ]; then
echo "System packages will be installed via system package manager (requires sudo)"
check_sudo
else
echo "System packages will be installed via conda (default, safer)"
fi
echo ""

check_conda_env
check_wheels

# Install openssl as we overwrite the default version when we update LD_LIBRARY_PATH
conda install -y openssl

install_system_packages "$USE_SUDO"
check_gh_install
download_vllm_wheel

log_info "Installing PyTorch nightly..."
pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129
pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/cu128

# Install vLLM and its requirements
pip install -r .github/packaging/vllm_reqs_12_8.txt
pip install vllm --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge

# Install monarch
pip install torchmonarch==0.1.0rc7

log_info "Installing all wheels (local + downloaded)..."
pip install "$WHEEL_DIR"/*.whl
# Install torchtitan and torchstore
pip install torchtitan==0.2.0
pip install torchstore==0.0.1rc2

log_info "Installing Forge from source..."
pip install -e .
pip install -e ".[dev]"

# Set up environment
log_info "Setting up environment..."
Expand All @@ -301,7 +173,7 @@ main() {
local cuda_activation_script="${conda_env_dir}/etc/conda/activate.d/cuda_env.sh"
cat > "$cuda_activation_script" << 'EOF'
# CUDA environment for Forge
export CUDA_VERSION=12.9
export CUDA_VERSION=12.8
export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc
export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc
export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
Expand Down
Loading