Skip to content

Commit d69a3f9

Browse files
authored
Merge branch 'master' into lr-finder
2 parents 7145575 + b7ca4d3 commit d69a3f9

File tree

11 files changed

+362
-150
lines changed

11 files changed

+362
-150
lines changed

.azure/gpu-benchmarks.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ jobs:
100100
workingDirectory: tests/
101101
displayName: "Testing: benchmarks"
102102

103-
- bash: bash run_standalone_tasks.sh
103+
- bash: |
104+
bash run_standalone_tasks.sh cpu
105+
bash run_standalone_tasks.sh cuda
104106
workingDirectory: tests/parity_fabric
105107
# without succeeded(), this step could run even if the job has already failed
106108
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))

.azure/gpu-tests-pytorch.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
# Python package
2-
# Create and test a Python package on multiple Python versions.
3-
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
4-
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
5-
61
trigger:
72
tags:
83
include: ["*"]
@@ -24,18 +19,18 @@ pr:
2419
- "examples/run_pl_examples.sh"
2520
- "examples/pytorch/basics/backbone_image_classifier.py"
2621
- "examples/pytorch/basics/autoencoder.py"
22+
- "requirements/fabric/**"
2723
- "requirements/pytorch/**"
2824
- "src/lightning/__init__.py"
2925
- "src/lightning/__setup__.py"
3026
- "src/lightning/__version__.py"
31-
- "src/lightning/pytorch/**"
27+
- "src/lightning_fabric/*"
28+
- "src/lightning/fabric/**"
3229
- "src/pytorch_lightning/*"
30+
- "src/lightning/pytorch/**"
3331
- "tests/tests_pytorch/**"
3432
- "tests/run_standalone_*.sh"
3533
- "pyproject.toml" # includes pytest config
36-
- "requirements/fabric/**"
37-
- "src/lightning/fabric/**"
38-
- "src/lightning_fabric/*"
3934
exclude:
4035
- "requirements/*/docs.txt"
4136
- "*.md"

.github/checkgroup.yml

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ subprojects:
2121
checks:
2222
- "pl-cpu-guardian" # aggregated check for all cases
2323

24-
- id: "pytorch_lightning: Azure GPU"
24+
- id: "pytorch_lightning: lit GPU"
2525
paths:
2626
- ".actions/*"
27-
- ".azure/gpu-tests-pytorch.yml"
27+
- ".lightning/workflows/pytorch.yml"
2828
# only the azure GPU workflow runs the examples
2929
# all examples don't need to be added because they aren't used in CI, but these are
3030
- "examples/run_pl_examples.sh"
@@ -47,13 +47,13 @@ subprojects:
4747
- "!*.md"
4848
- "!**/*.md"
4949
checks:
50-
- "pytorch-lightning (GPUs) (testing Lightning | latest)"
51-
- "pytorch-lightning (GPUs) (testing PyTorch | oldest)"
52-
- "pytorch-lightning (GPUs) (testing PyTorch | latest)"
50+
- "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10, A100_X_2)"
51+
- "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
52+
- "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, pytorch, 3.12, L4_X_2)"
5353

54-
- id: "pytorch_lightning: Benchmarks"
54+
- id: "Benchmarks"
5555
paths:
56-
- ".azure/gpu-benchmarks.yml"
56+
- ".lightning/workflows/benchmark.yml"
5757
- "requirements/fabric/**"
5858
- "requirements/pytorch/**"
5959
- "src/lightning/fabric/**"
@@ -65,7 +65,8 @@ subprojects:
6565
- "!*.md"
6666
- "!**/*.md"
6767
checks:
68-
- "lightning.Benchmarks"
68+
- "benchmark.yml / Lit Job (fabric)"
69+
- "benchmark.yml / Lit Job (pytorch)"
6970

7071
# Temporarily disabled
7172
# - id: "pytorch-lightning: TPU workflow"
@@ -128,10 +129,10 @@ subprojects:
128129
checks:
129130
- "fabric-cpu-guardian" # aggregated check for all cases
130131

131-
- id: "lightning_fabric: Azure GPU"
132+
- id: "lightning_fabric: lit GPU"
132133
paths:
133134
- ".actions/*"
134-
- ".azure/gpu-tests-fabric.yml"
135+
- ".lightning/workflows/fabric.yml"
135136
- "examples/fabric/**"
136137
- "examples/run_fabric_examples.sh"
137138
- "requirements/fabric/**"
@@ -147,9 +148,9 @@ subprojects:
147148
- "!*.md"
148149
- "!**/*.md"
149150
checks:
150-
- "lightning-fabric (GPUs) (testing Fabric | oldest)"
151-
- "lightning-fabric (GPUs) (testing Fabric | latest)"
152-
- "lightning-fabric (GPUs) (testing Lightning | latest)"
151+
- "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10, A100_X_2)"
152+
- "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, fabric, 3.12, L4_X_2)"
153+
- "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
153154

154155
# Temporarily disabled
155156
# - id: "lightning_fabric: TPU workflow"

.lightning/workflows/benchmark.yml

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
trigger:
2+
push:
3+
branches: ["master", "release/stable"]
4+
pull_request:
5+
branches: ["master", "release/stable"]
6+
7+
timeout: "90" # minutes
8+
parametrize:
9+
matrix:
10+
PACKAGE_NAME: ["fabric", "pytorch"]
11+
image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
12+
machine: "L4_X_2"
13+
env:
14+
TZ: "Etc/UTC"
15+
DEBIAN_FRONTEND: "noninteractive"
16+
python_version: "3.12"
17+
MKL_THREADING_LAYER: "GNU"
18+
CUDA_LAUNCH_BLOCKING: "1"
19+
NCCL_DEBUG: "INFO"
20+
TORCHDYNAMO_VERBOSE: "1"
21+
FREEZE_REQUIREMENTS: "1"
22+
RUN_ONLY_CUDA_TESTS: "1"
23+
24+
run: |
25+
# Install Python and UV
26+
apt-get update -qq --fix-missing
27+
apt-get install -q -y software-properties-common curl
28+
# Add deadsnakes PPA for newer Python versions if needed
29+
add-apt-repository ppa:deadsnakes/ppa -y
30+
apt-get update -qq --fix-missing
31+
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
32+
build-essential \
33+
pkg-config \
34+
cmake \
35+
ca-certificates \
36+
libopenmpi-dev \
37+
openmpi-bin
38+
39+
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
40+
ln -sf /usr/bin/python${python_version} /usr/bin/python
41+
curl -LsSf https://astral.sh/uv/install.sh | sh
42+
43+
# Source the environment and ensure UV is in PATH
44+
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
45+
export PATH="$HOME/.local/bin:$PATH"
46+
source $HOME/.cargo/env 2>/dev/null || true
47+
export PATH="$HOME/.cargo/bin:$PATH"
48+
49+
# Verify UV installation
50+
# Abort the script if uv is not on PATH. A brace group is used instead of a
# ( ... ) subshell because `exit` inside a subshell only terminates the
# subshell, and `set -e` is not enabled until later in this script.
command -v uv || { echo "UV not found in PATH"; exit 1; }
51+
# Create and activate a local uv virtual environment
52+
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
53+
. .venv/bin/activate
54+
hash -r
55+
56+
whereis nvidia
57+
nvidia-smi
58+
python --version
59+
uv --version
60+
uv pip list
61+
set -ex
62+
63+
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-devel-ubuntu22.04"
64+
IMAGE_TAG="${image##*:}" # "12.6.3-devel-ubuntu22.04"
65+
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
66+
echo "Using CUDA version: ${CUDA_VERSION}"
67+
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
68+
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
69+
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
70+
71+
# Adjust tests
72+
uv pip install -q -r .actions/requirements.txt
73+
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
74+
--source_import="lightning.fabric,lightning.pytorch" \
75+
--target_import="lightning_fabric,pytorch_lightning"
76+
77+
# Install package
78+
uv pip install ".[dev]"
79+
80+
# Env details
81+
python requirements/collect_env_details.py
82+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
83+
84+
cd tests/
85+
# Testing: benchmarks
86+
export PL_RUNNING_BENCHMARKS=1
87+
python -m pytest parity_${PACKAGE_NAME} -v --durations=0
88+
export PL_RUNNING_BENCHMARKS=0
89+
90+
# Testing: fabric standalone tasks
91+
export PL_RUN_STANDALONE_TESTS=1
92+
if [ "${PACKAGE_NAME}" == "fabric" ]; then
93+
cd parity_fabric/
94+
bash run_standalone_tasks.sh cuda
95+
cd ..
96+
fi
97+
export PL_RUN_STANDALONE_TESTS=0
98+
99+
cd ..
100+
echo "Benchmarks completed successfully"

0 commit comments

Comments
 (0)