Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
1305232
test
casteryh Sep 5, 2025
4c7ca79
add back ubuntu-latest
casteryh Sep 5, 2025
0aecf7c
test
casteryh Sep 5, 2025
286dfaa
add cuda
casteryh Sep 5, 2025
2cb8389
lint
casteryh Sep 5, 2025
376856a
switch to gpu...
casteryh Sep 5, 2025
d2d4a1d
fix python
casteryh Sep 5, 2025
e5088d2
oops
casteryh Sep 5, 2025
746c80b
fix
casteryh Sep 5, 2025
c1e2bda
switch order
casteryh Sep 5, 2025
65d6b75
switch order
casteryh Sep 5, 2025
a20d3e0
test
casteryh Sep 5, 2025
4659a93
update setup-miniconda version
casteryh Sep 5, 2025
7e71ab9
fix
casteryh Sep 5, 2025
d0ab110
add conda activate
casteryh Sep 5, 2025
0bf4bad
debug: print PATH
casteryh Sep 5, 2025
cd8984a
add conda init
casteryh Sep 5, 2025
a0873f1
manually activate conda env
casteryh Sep 5, 2025
d603ad9
wut?
casteryh Sep 5, 2025
bc3e758
fix
casteryh Sep 5, 2025
b823a12
setup-miniconda doesn't work well with self-hosted runners, ad hoc fix
casteryh Sep 5, 2025
d0d5572
<Replace this line with a title. Use 1 line only, 67 chars or less>
casteryh Sep 5, 2025
8414480
add cuda
casteryh Sep 5, 2025
fb7485a
fix and cleanup
casteryh Sep 5, 2025
66b7cc7
cleanup
casteryh Sep 5, 2025
d41950f
??
casteryh Sep 5, 2025
7f73c9c
fix and switch order.
casteryh Sep 5, 2025
4847b6a
fix syntax
casteryh Sep 5, 2025
b642314
add torchtitan
casteryh Sep 5, 2025
998f4e6
fix
casteryh Sep 5, 2025
ba8e5b9
fix
casteryh Sep 5, 2025
bd8f12c
add titan secrets
casteryh Sep 5, 2025
9033dc4
add more dependencies
casteryh Sep 5, 2025
27a0786
add vllm
casteryh Sep 5, 2025
dc0f874
fix
casteryh Sep 5, 2025
d42fd90
?
casteryh Sep 5, 2025
3f1808b
test
casteryh Sep 18, 2025
797807a
add matrix to code-cov as well
casteryh Sep 18, 2025
951b4e5
fix indentation
casteryh Sep 18, 2025
044c703
fix indent
casteryh Sep 18, 2025
4c34745
fix secrets
casteryh Sep 18, 2025
3495b0b
fix secrets
casteryh Sep 18, 2025
f0e95d3
change runner type
casteryh Sep 18, 2025
58d1acb
fix cuda version
casteryh Sep 18, 2025
71c3850
fix secret name
casteryh Sep 18, 2025
7cd9627
ssh key debug
casteryh Sep 18, 2025
3019f6c
test
casteryh Sep 18, 2025
c0ac1fb
fix secrets
casteryh Sep 18, 2025
8fb2d64
fix secrets ??
casteryh Sep 18, 2025
5886d3f
debug
casteryh Sep 18, 2025
8c0ad8d
change to v1
casteryh Sep 18, 2025
061ba88
debug
casteryh Sep 18, 2025
e18f827
rearrange order
casteryh Sep 18, 2025
04d9e22
don't create new envs
casteryh Sep 18, 2025
5ec72ca
pin pytorch version
casteryh Sep 18, 2025
7d5d71b
remove python version
casteryh Sep 18, 2025
3ef9054
debug conda
casteryh Sep 18, 2025
7efccb8
used another docker image
casteryh Sep 18, 2025
e5c7aa8
add github.com to known_hosts
casteryh Sep 18, 2025
b940e2e
don't use torchtitan docker
casteryh Sep 18, 2025
37abb4f
disable conda init
casteryh Sep 18, 2025
0531092
fix torchtitan dependency
casteryh Sep 18, 2025
3318ee6
rearrange
casteryh Sep 18, 2025
917d6d3
rearrange
casteryh Sep 18, 2025
83a4cb1
fix torchtitan dependency
casteryh Sep 18, 2025
67ccd09
install torchtitan from source
casteryh Sep 18, 2025
1565bda
fix
casteryh Sep 18, 2025
de054c8
pin torch version
casteryh Sep 18, 2025
f672342
test
casteryh Sep 18, 2025
1285f5e
update glibc
casteryh Sep 19, 2025
3923590
yum update glibc
casteryh Sep 19, 2025
f1fd533
glibc debug
casteryh Sep 19, 2025
2d6c15e
new image
casteryh Sep 22, 2025
19bacb9
install monarch nightly
casteryh Sep 22, 2025
7e33505
cuda 12.8
casteryh Sep 22, 2025
f45fafd
rearrange things
casteryh Sep 22, 2025
ee53a13
fix monarch dependencies
casteryh Sep 22, 2025
a94d9d2
export ld library path
casteryh Sep 22, 2025
075f154
update ld library path
casteryh Sep 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/activate_conda_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Make conda usable in the current shell and switch to the `test` environment.
# NOTE(review): `conda activate` changes the calling shell only, so this file
# presumably has to be sourced (not executed) by its caller -- confirm.

# Let conda install its shell integration (per the conda docs this edits the
# user's shell startup file).
conda init
# Re-read ~/.bashrc so that integration is loaded into this shell.
# NOTE(review): many stock ~/.bashrc files return early in non-interactive
# shells, in which case this line would not load conda -- verify on the runner.
source ~/.bashrc
# Activate the environment named `test`.
conda activate test
3 changes: 3 additions & 0 deletions .github/workflows/github_fingerprints.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl
github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=
github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=
139 changes: 110 additions & 29 deletions .github/workflows/unit_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,118 @@ on:


jobs:
unit_tests:
runs-on: ubuntu-latest
timeout-minutes: 15
build-and-test:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
secrets: inherit # Pass all secrets
strategy:
fail-fast: false
matrix:
include:
- name: 4xlargegpu
runs-on: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: "cuda"
gpu-arch-version: "12.8"
# adapted from torchtitan/.github/workflows/integration_test_8gpu_h100.yaml
with:
timeout: 120
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
submodules: recursive
upload-artifact: "coverage-report"
script: |
# Abort on the first failing command (-e), treat unset variables as errors
# (-u) and trace every command into the job log (-x).
set -eux

# Create and enter a dedicated Python 3.10 environment for this job.
conda create -n testenv python=3.10 --yes
conda activate testenv
echo "python version: $(python --version)"
echo "python path: $(which python)"

# Log CUDA driver version for debugging. `|| true` keeps a failing query from
# aborting the job under `set -e`.
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
echo "CUDA driver version: ${DRIVER_VERSION}"

# Point compilers and build scripts at the CUDA toolkit.
export CUDA_LIB_DIR=/usr/lib64
export CUDA_VERSION=12.8
export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
export NVCC=${CUDA_HOME}/bin/nvcc
export CUDA_NVCC_EXECUTABLE=${CUDA_HOME}/bin/nvcc
export PATH="${CUDA_HOME}/bin:$PATH"
export CUDA_INCLUDE_DIRS=${CUDA_HOME}/include
export CUDA_CUDART_LIBRARY=${CUDA_HOME}/lib64/libcudart.so

# Prepend the CUDA libraries to the loader and linker search paths.
# `${VAR:+:${VAR}}` appends the previous value only when it is set and
# non-empty: a bare `$VAR` aborts the job under `set -u` when the variable is
# unset, and an empty value would leave an empty entry (= current directory)
# in the search path.
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LIBRARY_PATH="${CUDA_HOME}/lib64${LIBRARY_PATH:+:${LIBRARY_PATH}}"

# Libraries shipped with the base conda installation are searched last.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/conda/lib"

# Abort early when the torchstore deploy key was not passed into the job.
# Tracing is paused around the check so `set -x` cannot echo the key into the
# log, and `${VAR:-}` lets an unset variable reach the message below instead
# of tripping `set -u` first.
# (The former `echo "$(ssh-agent -s)"` debug line was dropped: it started an
# agent that nothing used; the agent for the install is started further down.)
set +x
if [[ -z "${SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE:-}" ]]; then
  echo "SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE is empty" >&2
  exit 1
fi
set -x

# Add github fingerprint to known hosts so SSH connections to github.com are
# verified against the pinned keys instead of prompting.
mkdir -p ~/.ssh
cat .github/workflows/github_fingerprints.txt >> ~/.ssh/known_hosts

# Keep pip's progress bars out of the CI log.
python -m pip config --user set global.progress_bar off

# Upgrade pip
python -m pip install --upgrade pip

# # Install pytorch
# python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cu126

# Install torchtitan's requirements from a fresh checkout.
# The requirements file used to be installed twice in a row; the second run
# was a no-op and has been removed.
# NOTE(review): this step installs only torchtitan's requirements, not the
# checkout itself -- confirm whether the duplicated line was meant to install
# deps/torchtitan.
mkdir -p deps/torchtitan
git clone https://github.com/pytorch/torchtitan deps/torchtitan
python -m pip install -r deps/torchtitan/requirements.txt


# Install torchstore from its private repository over SSH.
# Start an agent for this shell; `eval` imports SSH_AUTH_SOCK / SSH_AGENT_PID.
eval "$(ssh-agent -s)"

# Create (or truncate) the key file and restrict it to the owner before any
# key material is written. Truncating instead of appending keeps a stale file
# from an earlier run on the same runner from corrupting the key.
: > torchstore_ssh_key
chmod 600 torchstore_ssh_key

# Pause command tracing while the private key is handled so it never reaches
# the job log. The key is deliberately NOT printed (not even partially):
# ssh-add below already fails the job if the key is malformed.
set +x
printf '%s\n' "${SECRET_FORGE_GITHUB_CI_FOR_TORCHSTORE}" > torchstore_ssh_key
set -x

ssh-add torchstore_ssh_key
# The agent now holds the key; do not leave a copy in the workspace.
rm -f torchstore_ssh_key

python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git

# System packages monarch needs: the compiler/unwinder development files
# first, then the RDMA (ibverbs / mlx5) stack.
monarch_build_pkgs=(clang-devel libunwind libunwind-devel)
monarch_rdma_pkgs=(libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel)

# Bring the image's packages up to date, then install both groups.
dnf -y update
dnf -y install "${monarch_build_pkgs[@]}"
dnf -y install "${monarch_rdma_pkgs[@]}"

# Fetch monarch and run the nightly installer script from the checkout.
monarch_src=deps/monarch
git clone https://github.com/meta-pytorch/monarch "${monarch_src}"
python "${monarch_src}/scripts/install_nightly.py"

# Install this project in editable mode with its dev extras.
# --no-build-isolation builds against the packages already installed in the
# environment instead of a fresh isolated build environment.
python -m pip install --no-build-isolation -e ".[dev]"

# monarch wheel is built with a newer glibc version. hopefully this doesn't break everything else.
# Print the glibc version before and after the update (first line of
# `ldd --version`) so the log shows whether it changed.
ldd --version | head -n 1
yum update -y
# -y is required here as well: without it yum asks for confirmation, which a
# non-interactive CI job cannot answer.
yum update -y glibc
ldd --version | head -n 1

# Run unit tests with coverage
# NOTE(review): --cov-report=xml writes coverage.xml into the current
# directory; confirm that is where the job's "coverage-report" artifact upload
# looks for it.
pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
upload-coverage:
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12']
needs: build-and-test
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v4
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
- name: Download coverage artifact
uses: actions/download-artifact@v4
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install pytorch
run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install monarch
run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci
- name: Install torchstore
run: |
eval "$(ssh-agent -s)"
ssh-add - <<< '${{ secrets.FORGE_GITHUB_CI_FOR_TORCHSTORE }}'
python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git
- name: Install dependencies
run: python -m pip install --no-build-isolation -e ".[dev]"
- name: Run unit tests with coverage
# TODO add all tests
run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
name: coverage-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
files: coverage.xml
token: ${{ secrets.CODECOV_TOKEN }}
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ dependencies = [
"torchdata>=0.8.0",
"torchtitan",
# vLLM
# TODO: pin specific vllm version
#"vllm==0.10.0",
"vllm==0.10.1",
# Hugging Face integrations
"transformers",
"datasets>=2.21.0",
"tokenizers",
# Miscellaneous
Expand Down
Loading