Skip to content

Commit e26cd46

Browse files
authored
Merge branch 'master' into bump/deepspeed
2 parents 588825e + e088694 commit e26cd46

File tree

19 files changed

+565
-181
lines changed

19 files changed

+565
-181
lines changed

.azure/gpu-benchmarks.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ jobs:
100100
workingDirectory: tests/
101101
displayName: "Testing: benchmarks"
102102

103-
- bash: bash run_standalone_tasks.sh
103+
- bash: |
104+
bash run_standalone_tasks.sh cpu
105+
bash run_standalone_tasks.sh cuda
104106
workingDirectory: tests/parity_fabric
105107
# without succeeded this could run even if the job has already failed
106108
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))

.azure/gpu-tests-pytorch.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
# Python package
2-
# Create and test a Python package on multiple Python versions.
3-
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
4-
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
5-
61
trigger:
72
tags:
83
include: ["*"]
@@ -24,18 +19,18 @@ pr:
2419
- "examples/run_pl_examples.sh"
2520
- "examples/pytorch/basics/backbone_image_classifier.py"
2621
- "examples/pytorch/basics/autoencoder.py"
22+
- "requirements/fabric/**"
2723
- "requirements/pytorch/**"
2824
- "src/lightning/__init__.py"
2925
- "src/lightning/__setup__.py"
3026
- "src/lightning/__version__.py"
31-
- "src/lightning/pytorch/**"
27+
- "src/lightning_fabric/*"
28+
- "src/lightning/fabric/**"
3229
- "src/pytorch_lightning/*"
30+
- "src/lightning/pytorch/**"
3331
- "tests/tests_pytorch/**"
3432
- "tests/run_standalone_*.sh"
3533
- "pyproject.toml" # includes pytest config
36-
- "requirements/fabric/**"
37-
- "src/lightning/fabric/**"
38-
- "src/lightning_fabric/*"
3934
exclude:
4035
- "requirements/*/docs.txt"
4136
- "*.md"

.github/checkgroup.yml

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ subprojects:
2121
checks:
2222
- "pl-cpu-guardian" # aggregated check for all cases
2323

24-
- id: "pytorch_lightning: Azure GPU"
24+
- id: "pytorch_lightning: lit GPU"
2525
paths:
2626
- ".actions/*"
27-
- ".azure/gpu-tests-pytorch.yml"
27+
- ".lightning/workflows/pytorch.yml"
2828
# only the GPU workflow runs the examples (written for the former Azure pipeline; confirm for the lit workflow)
2929
# all examples don't need to be added because they aren't used in CI, but these are
3030
- "examples/run_pl_examples.sh"
@@ -47,13 +47,13 @@ subprojects:
4747
- "!*.md"
4848
- "!**/*.md"
4949
checks:
50-
- "pytorch-lightning (GPUs) (testing Lightning | latest)"
51-
- "pytorch-lightning (GPUs) (testing PyTorch | oldest)"
52-
- "pytorch-lightning (GPUs) (testing PyTorch | latest)"
50+
- "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10, L4_X_2)"
51+
- "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
52+
- "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, pytorch, 3.12, L4_X_2)"
5353

54-
- id: "pytorch_lightning: Benchmarks"
54+
- id: "Benchmarks"
5555
paths:
56-
- ".azure/gpu-benchmarks.yml"
56+
- ".lightning/workflows/benchmark.yml"
5757
- "requirements/fabric/**"
5858
- "requirements/pytorch/**"
5959
- "src/lightning/fabric/**"
@@ -65,7 +65,8 @@ subprojects:
6565
- "!*.md"
6666
- "!**/*.md"
6767
checks:
68-
- "lightning.Benchmarks"
68+
- "benchmark.yml / Lit Job (fabric)"
69+
- "benchmark.yml / Lit Job (pytorch)"
6970

7071
# Temporarily disabled
7172
# - id: "pytorch-lightning: TPU workflow"
@@ -128,10 +129,10 @@ subprojects:
128129
checks:
129130
- "fabric-cpu-guardian" # aggregated check for all cases
130131

131-
- id: "lightning_fabric: Azure GPU"
132+
- id: "lightning_fabric: lit GPU"
132133
paths:
133134
- ".actions/*"
134-
- ".azure/gpu-tests-fabric.yml"
135+
- ".lightning/workflows/fabric.yml"
135136
- "examples/fabric/**"
136137
- "examples/run_fabric_examples.sh"
137138
- "requirements/fabric/**"
@@ -147,9 +148,9 @@ subprojects:
147148
- "!*.md"
148149
- "!**/*.md"
149150
checks:
150-
- "lightning-fabric (GPUs) (testing Fabric | oldest)"
151-
- "lightning-fabric (GPUs) (testing Fabric | latest)"
152-
- "lightning-fabric (GPUs) (testing Lightning | latest)"
151+
- "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10, L4_X_2)"
152+
- "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, fabric, 3.12, L4_X_2)"
153+
- "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
153154

154155
# Temporarily disabled
155156
# - id: "lightning_fabric: TPU workflow"

.github/markdown-links-config.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
}
2424
}
2525
],
26-
"timeout": "20s",
26+
"timeout": "30s",
2727
"retryOn429": true,
28-
"retryCount": 5,
29-
"fallbackRetryDelay": "20s"
28+
"retryCount": 10,
29+
"fallbackRetryDelay": "10s"
3030
}

.github/workflows/docs-build.yml

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,26 +67,33 @@ jobs:
6767
- uses: actions/checkout@v5
6868
with:
6969
ref: ${{ inputs.checkout }}
70+
token: ${{ secrets.GITHUB_TOKEN }}
7071
# only Pytorch has/uses notebooks
7172
submodules: ${{ matrix.pkg-name == 'pytorch' }}
7273
lfs: ${{ matrix.pkg-name == 'pytorch' }}
73-
- uses: actions/setup-python@v6
74+
75+
- name: Install uv and set Python version
76+
uses: astral-sh/setup-uv@v6
7477
with:
7578
python-version: "3.10"
79+
# TODO: Avoid activating environment like this
80+
# see: https://github.com/astral-sh/setup-uv/tree/v6/?tab=readme-ov-file#activate-environment
81+
activate-environment: true
82+
enable-cache: true
7683

7784
- name: List notebooks
7885
if: ${{ matrix.pkg-name == 'pytorch' }}
7986
working-directory: _notebooks/
8087
run: |
81-
pip install -q py-tree
88+
uv pip install -q py-tree
8289
py-tree .notebooks/
8390
ls -lhR .notebooks/
8491
8592
- name: Pull sphinx template
8693
run: |
87-
pip install -q -r requirements/ci.txt
94+
uv pip install -q -r requirements/ci.txt
8895
aws s3 sync --no-sign-request s3://sphinx-packages/ ${PYPI_LOCAL_DIR}
89-
pip install lai-sphinx-theme -U -f ${PYPI_LOCAL_DIR}
96+
uv pip install lai-sphinx-theme -U -f ${PYPI_LOCAL_DIR}
9097
9198
- name: pip wheels cache
9299
uses: actions/cache/restore@v4
@@ -100,25 +107,29 @@ jobs:
100107
run: |
101108
sudo apt-get update --fix-missing
102109
sudo apt-get install -y pandoc
110+
103111
- name: Install package & dependencies
104112
timeout-minutes: 20
105113
run: |
106114
mkdir -p ${PYPI_CACHE_DIR} # in case cache was not hit
107115
ls -lh ${PYPI_CACHE_DIR}
108-
pip install .[all] -U -r requirements/${{ matrix.pkg-name }}/docs.txt \
116+
uv pip install .[all] -U -r requirements/${{ matrix.pkg-name }}/docs.txt \
109117
-f ${PYPI_LOCAL_DIR} -f ${PYPI_CACHE_DIR} --extra-index-url="${TORCH_URL}"
110-
pip list
118+
uv pip list
119+
111120
- name: Install req. for Notebooks/tutorials
112121
if: matrix.pkg-name == 'pytorch'
113122
timeout-minutes: 10
114-
run: pip install -q -r _notebooks/.actions/requires.txt
123+
run: uv pip install -q -r _notebooks/.actions/requires.txt
115124

116125
- name: Full build for deployment
117126
if: github.event_name != 'pull_request'
118127
run: echo "DOCS_FETCH_ASSETS=1" >> $GITHUB_ENV
128+
119129
- name: Build without warnings
120130
if: github.event_name != 'workflow_dispatch'
121131
run: echo "BUILD_SPHINX_OPTS=-W --keep-going" >> $GITHUB_ENV
132+
122133
- name: Make ${{ matrix.target }}
123134
working-directory: ./docs/source-${{ matrix.pkg-name }}
124135
# allow failing link check and doctest if you run with dispatch
@@ -128,6 +139,7 @@ jobs:
128139
- name: Keep artifact
129140
if: github.event_name == 'pull_request'
130141
run: echo "ARTIFACT_DAYS=7" >> $GITHUB_ENV
142+
131143
- name: Upload built docs
132144
if: ${{ matrix.target == 'html' }}
133145
uses: actions/upload-artifact@v4

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[submodule "_notebooks"]
22
path = _notebooks
3-
url = https://github.com/Lightning-AI/lightning-tutorials.git
3+
url = https://github.com/Lightning-AI/tutorials.git
44
branch = publication

.lightning/workflows/benchmark.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
trigger:
2+
push:
3+
branches: ["master", "release/stable"]
4+
pull_request:
5+
branches: ["master", "release/stable"]
6+
7+
timeout: "90" # minutes
8+
parametrize:
9+
matrix:
10+
PACKAGE_NAME: ["fabric", "pytorch"]
11+
image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
12+
machine: "L4_X_2"
13+
env:
14+
TZ: "Etc/UTC"
15+
DEBIAN_FRONTEND: "noninteractive"
16+
python_version: "3.12"
17+
MKL_THREADING_LAYER: "GNU"
18+
CUDA_LAUNCH_BLOCKING: "1"
19+
NCCL_DEBUG: "INFO"
20+
TORCHDYNAMO_VERBOSE: "1"
21+
FREEZE_REQUIREMENTS: "1"
22+
RUN_ONLY_CUDA_TESTS: "1"
23+
24+
run: |
25+
echo "Installing dependencies"
26+
apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null
27+
apt-get install -q -y software-properties-common curl
28+
echo "Add deadsnakes PPA for newer Python versions if needed"
29+
add-apt-repository ppa:deadsnakes/ppa -y
30+
apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null
31+
echo "Install Python ${python_version} and other dependencies"
32+
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
33+
build-essential \
34+
pkg-config \
35+
cmake \
36+
ca-certificates \
37+
libopenmpi-dev \
38+
openmpi-bin
39+
40+
echo "Install Python ${python_version} and UV"
41+
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
42+
ln -sf /usr/bin/python${python_version} /usr/bin/python
43+
curl -LsSf https://astral.sh/uv/install.sh | sh
44+
45+
echo "Source the environment and ensure UV is in PATH"
46+
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
47+
export PATH="$HOME/.local/bin:$PATH"
48+
source $HOME/.cargo/env 2>/dev/null || true
49+
export PATH="$HOME/.cargo/bin:$PATH"
50+
51+
echo "Verify UV installation"
52+
command -v uv || { echo "UV not found in PATH"; exit 1; }
53+
# Create and activate a local uv virtual environment
54+
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
55+
. .venv/bin/activate
56+
hash -r
57+
58+
echo "Show system information"
59+
whereis nvidia
60+
nvidia-smi
61+
python --version
62+
uv --version
63+
uv pip list
64+
set -ex
65+
66+
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-devel-ubuntu22.04"
67+
IMAGE_TAG="${image##*:}" # "12.6.3-devel-ubuntu22.04"
68+
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
69+
echo "Using CUDA version: ${CUDA_VERSION}"
70+
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
71+
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
72+
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
73+
74+
echo "Adjust tests"
75+
uv pip install -q -r .actions/requirements.txt
76+
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
77+
--source_import="lightning.fabric,lightning.pytorch" \
78+
--target_import="lightning_fabric,pytorch_lightning"
79+
80+
echo "Install package"
81+
uv pip install ".[dev]"
82+
83+
# Env details
84+
python requirements/collect_env_details.py
85+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
86+
87+
cd tests/
88+
echo "Testing: benchmarks"
89+
export PL_RUNNING_BENCHMARKS=1
90+
python -m pytest parity_${PACKAGE_NAME} -v --durations=0
91+
export PL_RUNNING_BENCHMARKS=0
92+
93+
echo "Testing: fabric standalone tasks"
94+
export PL_RUN_STANDALONE_TESTS=1
95+
if [ "${PACKAGE_NAME}" == "fabric" ]; then
96+
cd parity_fabric/
97+
bash run_standalone_tasks.sh cuda
98+
cd ..
99+
fi
100+
export PL_RUN_STANDALONE_TESTS=0
101+
102+
cd ..
103+
echo "Benchmarks completed successfully"

.lightning/workflows/fabric.yml

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ parametrize:
1212
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
1313
PACKAGE_NAME: "fabric"
1414
python_version: "3.10"
15-
machine: "A100_X_2"
15+
machine: "L4_X_2"
1616
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
1717
PACKAGE_NAME: "fabric"
1818
python_version: "3.12"
@@ -37,12 +37,13 @@ env:
3737
RUN_ONLY_CUDA_TESTS: "1"
3838

3939
run: |
40-
# Install Python and UV
41-
apt-get update -qq --fix-missing
40+
echo "Installing dependencies"
41+
apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null
4242
apt-get install -q -y software-properties-common curl
43-
# Add deadsnakes PPA for newer Python versions if needed
43+
echo "Add deadsnakes PPA for newer Python versions if needed"
4444
add-apt-repository ppa:deadsnakes/ppa -y
45-
apt-get update -qq --fix-missing
45+
apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null
46+
echo "Install Python ${python_version} and other dependencies"
4647
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
4748
build-essential \
4849
pkg-config \
@@ -54,23 +55,25 @@ run: |
5455
libnccl2 \
5556
libnccl-dev
5657
58+
echo "Install Python ${python_version} and UV"
5759
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
5860
ln -sf /usr/bin/python${python_version} /usr/bin/python
5961
curl -LsSf https://astral.sh/uv/install.sh | sh
6062
61-
# Source the environment and ensure UV is in PATH
63+
echo "Source the environment and ensure UV is in PATH"
6264
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
6365
export PATH="$HOME/.local/bin:$PATH"
6466
source $HOME/.cargo/env 2>/dev/null || true
6567
export PATH="$HOME/.cargo/bin:$PATH"
6668
67-
# Verify UV installation
69+
echo "Verify UV installation"
6870
command -v uv || { echo "UV not found in PATH"; exit 1; }
6971
# Create and activate a local uv virtual environment
7072
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
7173
. .venv/bin/activate
7274
hash -r
7375
76+
echo "Show system information"
7477
whereis nvidia
7578
nvidia-smi
7679
python --version
@@ -98,7 +101,7 @@ run: |
98101
uv pip install "cython<3.0" wheel # for compatibility
99102
fi
100103
101-
# install the base so we can adjust other packages
104+
echo "Install the base so we can adjust other packages"
102105
uv pip install .
103106
echo "Adjust torch versions in requirements files"
104107
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
@@ -119,6 +122,7 @@ run: |
119122
--target_import="lightning_fabric"
120123
fi
121124
125+
echo "Install package with [${PACKAGE_NAME}] extras"
122126
extra=$(python -c "print({'lightning': 'fabric-'}.get('${PACKAGE_NAME}', ''))")
123127
uv pip install ".[${extra}dev]" --upgrade
124128

0 commit comments

Comments
 (0)