Skip to content

Commit 4b1079a

Browse files
convert benchmarks to litCI (#21199)
* convert benchmarks to litCI * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * python_version * image * install * noninteractive * uv * uv * cuda --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent da0e646 commit 4b1079a

File tree

2 files changed

+105
-2
lines changed

2 files changed

+105
-2
lines changed

.lightning/workflows/benchmark.yml

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
trigger:
2+
push:
3+
branches: ["master", "release/stable"]
4+
pull_request:
5+
branches: ["master", "release/stable"]
6+
7+
timeout: "90" # minutes
8+
parametrize:
9+
matrix:
10+
PACKAGE_NAME: ["fabric", "pytorch"]
11+
image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
12+
machine: "L4_X_2"
13+
env:
14+
TZ: "Etc/UTC"
15+
DEBIAN_FRONTEND: "noninteractive"
16+
python_version: "3.12"
17+
MKL_THREADING_LAYER: "GNU"
18+
CUDA_LAUNCH_BLOCKING: "1"
19+
NCCL_DEBUG: "INFO"
20+
TORCHDYNAMO_VERBOSE: "1"
21+
FREEZE_REQUIREMENTS: "1"
22+
RUN_ONLY_CUDA_TESTS: "1"
23+
24+
run: |
25+
# Install Python and UV
26+
apt-get update -qq --fix-missing
27+
apt-get install -q -y software-properties-common curl
28+
# Add deadsnakes PPA for newer Python versions if needed
29+
add-apt-repository ppa:deadsnakes/ppa -y
30+
apt-get update -qq --fix-missing
31+
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
32+
build-essential \
33+
pkg-config \
34+
cmake \
35+
ca-certificates \
36+
libopenmpi-dev \
37+
openmpi-bin
38+
39+
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
40+
ln -sf /usr/bin/python${python_version} /usr/bin/python
41+
curl -LsSf https://astral.sh/uv/install.sh | sh
42+
43+
# Source the environment and ensure UV is in PATH
44+
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
45+
export PATH="$HOME/.local/bin:$PATH"
46+
source $HOME/.cargo/env 2>/dev/null || true
47+
export PATH="$HOME/.cargo/bin:$PATH"
48+
49+
# Verify UV installation
50+
# Abort the job when uv is missing. A brace group is used because `exit` inside
# a `( ... )` subshell would only terminate the subshell, not this script.
command -v uv || { echo "UV not found in PATH" >&2; exit 1; }
51+
# Create and activate a local uv virtual environment
52+
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
53+
. .venv/bin/activate
54+
hash -r
55+
56+
whereis nvidia
57+
nvidia-smi
58+
python --version
59+
uv --version
60+
uv pip list
61+
# Enable exit-on-error (-e) and command tracing (-x) for the remaining steps.
# NOTE(review): the setup commands above run before this takes effect, so their
# failures are not caught by errexit unless the runner enables it itself — confirm.
set -ex
62+
63+
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-devel-ubuntu22.04"
64+
IMAGE_TAG="${image##*:}" # "12.6.3-devel-ubuntu22.04"
65+
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
66+
echo "Using CUDA version: ${CUDA_VERSION}"
67+
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
68+
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
69+
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
70+
71+
# Adjust tests
72+
uv pip install -q -r .actions/requirements.txt
73+
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
74+
--source_import="lightning.fabric,lightning.pytorch" \
75+
--target_import="lightning_fabric,pytorch_lightning"
76+
77+
# Install package
78+
uv pip install ".[dev]"
79+
80+
# Env details
81+
python requirements/collect_env_details.py
82+
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
83+
84+
cd tests/
85+
# Testing: benchmarks
86+
export PL_RUNNING_BENCHMARKS=1
87+
python -m pytest parity_${PACKAGE_NAME} -v --durations=0
88+
export PL_RUNNING_BENCHMARKS=0
89+
90+
# Testing: fabric standalone tasks
91+
export PL_RUN_STANDALONE_TESTS=1
92+
if [ "${PACKAGE_NAME}" == "fabric" ]; then
93+
cd parity_fabric/
94+
bash run_standalone_tasks.sh cuda
95+
cd ..
96+
fi
97+
export PL_RUN_STANDALONE_TESTS=0
98+
99+
cd ..
100+
echo "Benchmarks completed successfully"

tests/parity_fabric/run_standalone_tasks.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
1818
export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."
1919

2020
MAX_RETRIES=3
21+
# parsing argument from call like `bash run_standalone_tasks.sh cuda`
22+
# The accelerator argument is mandatory: abort with a usage hint instead of
# continuing with an empty value that would be passed on as `--accelerator=`.
ACCELERATOR="${1:?usage: bash run_standalone_tasks.sh ACCELERATOR [TOLERANCE]}"
23+
# optional tolerance argument, default to 0.01
24+
TOLERANCE=${2:-0.01}
2125

2226
retry_command() {
2327
local command="$@"
@@ -39,5 +43,4 @@ retry_command() {
3943
return $exit_code
4044
}
4145

42-
retry_command "python -m test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
43-
retry_command "python -m test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
46+
# Run the 2-device DDP parity check for the requested accelerator and tolerance.
# The command is one double-quoted string; the original nested quotes closed the
# string early and left the accelerator expansion unquoted (word splitting/globbing).
retry_command "python -m test_parity_ddp --accelerator=${ACCELERATOR} --devices=2 --tolerance=${TOLERANCE}"

0 commit comments

Comments
 (0)