Skip to content

Commit 3302098

Browse files
authored
Merge branch 'master' into docs/lightning_module_enhancements
2 parents 3b3a12c + bd1f3fd commit 3302098

File tree

4 files changed

+112
-6
lines changed

4 files changed

+112
-6
lines changed

.azure/gpu-benchmarks.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ jobs:
100100
workingDirectory: tests/
101101
displayName: "Testing: benchmarks"
102102

103-
- bash: bash run_standalone_tasks.sh
103+
- bash: |
104+
bash run_standalone_tasks.sh cpu
105+
bash run_standalone_tasks.sh cuda
104106
workingDirectory: tests/parity_fabric
105107
# without succeeded() in the condition, this step could run even if the job has already failed
106108
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))

.github/checkgroup.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ subprojects:
5151
- "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)"
5252
- "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, pytorch, 3.12, L4_X_2)"
5353

54-
- id: "pytorch_lightning: Benchmarks"
54+
- id: "Benchmarks"
5555
paths:
56-
- ".azure/gpu-benchmarks.yml"
56+
- ".lightning/workflows/benchmark.yml"
5757
- "requirements/fabric/**"
5858
- "requirements/pytorch/**"
5959
- "src/lightning/fabric/**"
@@ -65,7 +65,8 @@ subprojects:
6565
- "!*.md"
6666
- "!**/*.md"
6767
checks:
68-
- "lightning.Benchmarks"
68+
- "benchmark.yml / Lit Job (fabric)"
69+
- "benchmark.yml / Lit Job (pytorch)"
6970

7071
# Temporarily disabled
7172
# - id: "pytorch-lightning: TPU workflow"

.lightning/workflows/benchmark.yml

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
trigger:
2+
push:
3+
branches: ["master", "release/stable"]
4+
pull_request:
5+
branches: ["master", "release/stable"]
6+
7+
timeout: "90" # minutes
8+
parametrize:
9+
matrix:
10+
PACKAGE_NAME: ["fabric", "pytorch"]
11+
image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
12+
machine: "L4_X_2"
13+
env:
14+
TZ: "Etc/UTC"
15+
DEBIAN_FRONTEND: "noninteractive"
16+
python_version: "3.12"
17+
MKL_THREADING_LAYER: "GNU"
18+
CUDA_LAUNCH_BLOCKING: "1"
19+
NCCL_DEBUG: "INFO"
20+
TORCHDYNAMO_VERBOSE: "1"
21+
FREEZE_REQUIREMENTS: "1"
22+
RUN_ONLY_CUDA_TESTS: "1"
23+
24+
run: |
25+
# Install Python and UV
26+
apt-get update -qq --fix-missing
27+
apt-get install -q -y software-properties-common curl
28+
# Add the deadsnakes PPA so that newer Python versions can be installed via apt
29+
add-apt-repository ppa:deadsnakes/ppa -y
30+
apt-get update -qq --fix-missing
31+
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
32+
build-essential \
33+
pkg-config \
34+
cmake \
35+
ca-certificates \
36+
libopenmpi-dev \
37+
openmpi-bin
38+
39+
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
40+
ln -sf /usr/bin/python${python_version} /usr/bin/python
41+
curl -LsSf https://astral.sh/uv/install.sh | sh
42+
43+
# Source the environment and ensure UV is in PATH
44+
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
45+
export PATH="$HOME/.local/bin:$PATH"
46+
source $HOME/.cargo/env 2>/dev/null || true
47+
export PATH="$HOME/.cargo/bin:$PATH"
48+
49+
# Verify UV installation
50+
command -v uv || (echo "UV not found in PATH" && exit 1)
51+
# Create and activate a local uv virtual environment
52+
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
53+
. .venv/bin/activate
54+
hash -r
55+
56+
whereis nvidia
57+
nvidia-smi
58+
python --version
59+
uv --version
60+
uv pip list
61+
set -ex
62+
63+
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-devel-ubuntu22.04"
64+
IMAGE_TAG="${image##*:}" # "12.6.3-devel-ubuntu22.04"
65+
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
66+
echo "Using CUDA version: ${CUDA_VERSION}"
67+
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
68+
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
69+
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
70+
71+
# Adjust tests
72+
uv pip install -q -r .actions/requirements.txt
73+
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
74+
--source_import="lightning.fabric,lightning.pytorch" \
75+
--target_import="lightning_fabric,pytorch_lightning"
76+
77+
# Install package
78+
uv pip install ".[dev]"
79+
80+
# Env details
81+
python requirements/collect_env_details.py
82+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
83+
84+
cd tests/
85+
# Testing: benchmarks
86+
export PL_RUNNING_BENCHMARKS=1
87+
python -m pytest parity_${PACKAGE_NAME} -v --durations=0
88+
export PL_RUNNING_BENCHMARKS=0
89+
90+
# Testing: fabric standalone tasks
91+
export PL_RUN_STANDALONE_TESTS=1
92+
if [ "${PACKAGE_NAME}" == "fabric" ]; then
93+
cd parity_fabric/
94+
bash run_standalone_tasks.sh cuda
95+
cd ..
96+
fi
97+
export PL_RUN_STANDALONE_TESTS=0
98+
99+
cd ..
100+
echo "Benchmarks completed successfully"

tests/parity_fabric/run_standalone_tasks.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
1818
export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."
1919

2020
MAX_RETRIES=3
21+
# first positional argument selects the accelerator, e.g. `bash run_standalone_tasks.sh cuda`
22+
ACCELERATOR=$1
23+
# optional second positional argument: tolerance, defaults to 0.01
24+
TOLERANCE=${2:-0.01}
2125

2226
retry_command() {
2327
local command="$@"
@@ -39,5 +43,4 @@ retry_command() {
3943
return $exit_code
4044
}
4145

42-
retry_command "python -m test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
43-
retry_command "python -m test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
46+
retry_command "python -m test_parity_ddp --accelerator="$ACCELERATOR" --devices=2 --tolerance=$TOLERANCE"

0 commit comments

Comments
 (0)